Commit eefb08ee authored by Andrew Morton, committed by Linus Torvalds

[PATCH] Infrastructure for correct hugepage refcounting

We currently have a problem when things like ptrace, futexes and direct-io
try to pin user pages.  If the user's address is in a huge page we're
elevating the refcount of a constituent 4k page, not the head page of the
high-order allocation unit.
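
To make the problem concrete, here is a sketch of how such a pin typically
comes about (illustration only, not part of this patch: pin_one_user_page()
is a made-up helper, and the get_user_pages() call is written against its
usual eight-argument form):

/* Hypothetical helper: pin the single page backing a user address. */
static int pin_one_user_page(unsigned long addr, struct page **pagep)
{
	int ret;

	down_read(&current->mm->mmap_sem);
	ret = get_user_pages(current, current->mm, addr, 1, 1, 0, pagep, NULL);
	up_read(&current->mm->mmap_sem);
	return ret;			/* 1 if one page was pinned */
}

If addr lies inside a huge page, the reference taken in there lands on one of
the constituent 4k struct pages; the head page of the high-order unit never
sees it.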

To solve this, a generic way of handling higher-order pages has been
implemented:

- A higher-order page is called a "compound page".  We chose this name
  because "huge page", "large page", "super page", etc. all seem to mean
  different things to different people.

- The first (controlling) 4k page of a compound page is referred to as the
  "head" page.

- The remaining pages are tail pages.

All pages have PG_compound set.  All pages have their lru.next pointing at
the head page (even the head page has this).

The head page's lru.prev, if non-zero, holds the address of the compound
page's put_page() function.
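
As an example of how a client would use this hook (hypothetical code, not
part of this patch: free_huge_page() and prep_new_huge_page() are made-up
names):

/* Called by put_page() with the compound page's head page. */
static void free_huge_page(struct page *head)
{
	/* hand the whole high-order unit back to the hugepage pool */
}

/* Hypothetical setup: install the destructor on a freshly prepared head. */
static void prep_new_huge_page(struct page *head)
{
	head->lru.prev = (void *)free_huge_page;
}

Once that is installed, put_page() against any constituent page calls
free_huge_page(head) and returns without touching the refcount, so the
destructor is responsible for whatever counting its owner needs.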

The order of the allocation is stored in the first tail page's lru.prev.
This is only for debug at present.  This usage means that zero-order pages
may not be compound.
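
Taken together, everything is recoverable from any constituent page.  A
minimal sketch of the layout, using made-up helper names (nothing here is
added by the patch itself):

/* Every page of a compound page, the head included, points at the head. */
static inline struct page *compound_head_page(struct page *page)
{
	return (struct page *)page->lru.next;
}

/* The allocation order sits in the first tail page's lru.prev. */
static inline unsigned long compound_page_order(struct page *page)
{
	struct page *head = compound_head_page(page);

	return (unsigned long)head[1].lru.prev;
}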

The above relationships are established for _all_ higher-order pages in the
page allocator.  This has some cost, but not much: mainly another atomic op
during fork().

This functionality is only enabled if CONFIG_HUGETLB_PAGE, although it could
be turned on permanently.  There's a little extra cost in get_page/put_page.
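
From a caller's point of view the effect is that a pin taken through any
constituent page lands on the head page.  A small usage sketch
(pin_and_release() is hypothetical; it relies only on the get_page()/put_page()
definitions added below):

/* Hypothetical caller: pin an order-2 compound page via one of its tails. */
static void pin_and_release(struct page *head)
{
	struct page *tail = head + 3;	/* any constituent page will do */

	get_page(tail);		/* increments head->count, not tail->count */
	/* ... access the memory ... */
	put_page(tail);		/* drops head->count, or calls the head's destructor */
}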

These changes do not preclude adding compound pages to the LRU in the future
- we can add a new page flag to the head page and then move all the
additional data to the first tail page's lru.next, lru.prev, list.next,
list.prev, index, private, etc.
parent 6725839b
@@ -208,24 +208,55 @@ struct page {
* Also, many kernel routines increase the page count before a critical
* routine so they can be sure the page doesn't go away from under them.
*/
#define get_page(p)		atomic_inc(&(p)->count)
#define __put_page(p)		atomic_dec(&(p)->count)
#define put_page_testzero(p)				\
	({						\
		BUG_ON(page_count(p) == 0);		\
		atomic_dec_and_test(&(p)->count);	\
	})
#define page_count(p)		atomic_read(&(p)->count)
#define set_page_count(p,v)	atomic_set(&(p)->count, v)
#define __put_page(p)		atomic_dec(&(p)->count)

extern void FASTCALL(__page_cache_release(struct page *));

#ifdef CONFIG_HUGETLB_PAGE

static inline void get_page(struct page *page)
{
	if (PageCompound(page))
		page = (struct page *)page->lru.next;
	atomic_inc(&page->count);
}

static inline void put_page(struct page *page)
{
	if (PageCompound(page)) {
		page = (struct page *)page->lru.next;
		if (page->lru.prev) {	/* destructor? */
			(*(void (*)(struct page *))page->lru.prev)(page);
			return;
		}
	}
	if (!PageReserved(page) && put_page_testzero(page))
		__page_cache_release(page);
}

#else		/* CONFIG_HUGETLB_PAGE */

static inline void get_page(struct page *page)
{
	atomic_inc(&page->count);
}

static inline void put_page(struct page *page)
{
	if (!PageReserved(page) && put_page_testzero(page))
		__page_cache_release(page);
}

#endif		/* CONFIG_HUGETLB_PAGE */

/*
* Multiple processes may "see" the same page. E.g. for untouched
* mappings of /dev/null, all processes see the same page full of
@@ -72,7 +72,8 @@
#define PG_direct 16 /* ->pte_chain points directly at pte */
#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
#define PG_reclaim 18 /* To be recalimed asap */
#define PG_reclaim 18 /* To be reclaimed asap */
#define PG_compound 19 /* Part of a compound page */
/*
* Global page accounting. One instance per CPU. Only unsigned longs are
@@ -251,6 +252,10 @@ extern void get_full_page_state(struct page_state *ret);
#define ClearPageReclaim(page) clear_bit(PG_reclaim, &(page)->flags)
#define TestClearPageReclaim(page) test_and_clear_bit(PG_reclaim, &(page)->flags)
#define PageCompound(page) test_bit(PG_compound, &(page)->flags)
#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags)
#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags)
/*
* The PageSwapCache predicate doesn't use a PG_flag at this time,
* but it may again do so one day.
@@ -85,6 +85,62 @@ static void bad_page(const char *function, struct page *page)
page->mapping = NULL;
}
#ifndef CONFIG_HUGETLB_PAGE
#define prep_compound_page(page, order) do { } while (0)
#define destroy_compound_page(page, order) do { } while (0)
#else
/*
* Higher-order pages are called "compound pages". They are structured thusly:
*
* The first PAGE_SIZE page is called the "head page".
*
* The remaining PAGE_SIZE pages are called "tail pages".
*
* All pages have PG_compound set. All pages have their lru.next pointing at
* the head page (even the head page has this).
*
* The head page's lru.prev, if non-zero, holds the address of the compound
* page's put_page() function.
*
* The order of the allocation is stored in the first tail page's lru.prev.
* This is only for debug at present. This usage means that zero-order pages
* may not be compound.
*/
static void prep_compound_page(struct page *page, int order)
{
	int i;
	int nr_pages = 1 << order;

	page->lru.prev = NULL;
	page[1].lru.prev = (void *)order;
	for (i = 0; i < nr_pages; i++) {
		struct page *p = page + i;

		SetPageCompound(p);
		p->lru.next = (void *)page;
	}
}

static void destroy_compound_page(struct page *page, int order)
{
	int i;
	int nr_pages = 1 << order;

	if (page[1].lru.prev != (void *)order)
		bad_page(__FUNCTION__, page);

	for (i = 0; i < nr_pages; i++) {
		struct page *p = page + i;

		if (!PageCompound(p))
			bad_page(__FUNCTION__, page);
		if (p->lru.next != (void *)page)
			bad_page(__FUNCTION__, page);
		ClearPageCompound(p);
	}
}

#endif /* CONFIG_HUGETLB_PAGE */
/*
* Freeing function for a buddy system allocator.
*
@@ -114,6 +170,8 @@ static inline void __free_pages_bulk (struct page *page, struct page *base,
{
	unsigned long page_idx, index;

	if (order)
		destroy_compound_page(page, order);
	page_idx = page - base;
	if (page_idx & ~mask)
		BUG();
@@ -409,6 +467,12 @@ void free_cold_page(struct page *page)
free_hot_cold_page(page, 1);
}
/*
* Really, prep_compound_page() should be called from __rmqueue_bulk(). But
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
{
unsigned long flags;
@@ -435,6 +499,8 @@ static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
		spin_lock_irqsave(&zone->lock, flags);
		page = __rmqueue(zone, order);
		spin_unlock_irqrestore(&zone->lock, flags);
		if (order && page)
			prep_compound_page(page, order);
	}

	if (page != NULL) {