Commit f1ebdd60 authored by Linus Torvalds

Merge branch 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6

* 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6: (22 commits)
  Add _addr_lsb field to ia64 siginfo
  Fix migration.c compilation on s390
  HWPOISON: Remove retry loop for try_to_unmap
  HWPOISON: Turn addr_valid from bitfield into char
  HWPOISON: Disable DEBUG by default
  HWPOISON: Convert pr_debugs to pr_info
  HWPOISON: Improve comments in memory-failure.c
  x86: HWPOISON: Report correct address granuality for huge hwpoison faults
  Encode huge page size for VM_FAULT_HWPOISON errors
  Fix build error with !CONFIG_MIGRATION
  hugepage: move is_hugepage_on_freelist inside ifdef to avoid warning
  Clean up __page_set_anon_rmap
  HWPOISON, hugetlb: fix unpoison for hugepage
  HWPOISON, hugetlb: soft offlining for hugepage
  HWPOSION, hugetlb: recover from free hugepage error when !MF_COUNT_INCREASED
  hugetlb: move refcounting in hugepage allocation inside hugetlb_lock
  HWPOISON, hugetlb: add free check to dequeue_hwpoison_huge_page()
  hugetlb: hugepage migration core
  hugetlb: redefine hugepage copy functions
  hugetlb: add allocate function for hugepage migration
  ...
parents f99d0553 46e387bb
@@ -62,6 +62,7 @@ typedef struct siginfo {
 			int _imm;		/* immediate value for "break" */
 			unsigned int _flags;	/* see below */
 			unsigned long _isr;	/* isr */
+			short _addr_lsb;	/* lsb of faulting address */
 		} _sigfault;

 		/* SIGPOLL */
......
@@ -11,6 +11,7 @@
 #include <linux/kprobes.h>		/* __kprobes, ...		*/
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
 #include <linux/perf_event.h>		/* perf_sw_event		*/
+#include <linux/hugetlb.h>		/* hstate_index_to_shift	*/

 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/

@@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)

 static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
-		     struct task_struct *tsk)
+		     struct task_struct *tsk, int fault)
 {
+	unsigned lsb = 0;
 	siginfo_t info;

 	info.si_signo	= si_signo;
 	info.si_errno	= 0;
 	info.si_code	= si_code;
 	info.si_addr	= (void __user *)address;
-	info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
+	if (fault & VM_FAULT_HWPOISON_LARGE)
+		lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+	if (fault & VM_FAULT_HWPOISON)
+		lsb = PAGE_SHIFT;
+	info.si_addr_lsb = lsb;

 	force_sig_info(si_signo, &info, tsk);
 }

@@ -722,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 	tsk->thread.error_code	= error_code | (address >= TASK_SIZE);
 	tsk->thread.trap_no	= 14;

-	force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+	force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);

 	return;
 }

@@ -807,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	tsk->thread.trap_no	= 14;

 #ifdef CONFIG_MEMORY_FAILURE
-	if (fault & VM_FAULT_HWPOISON) {
+	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
 		printk(KERN_ERR
 	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
 			tsk->comm, tsk->pid, address);
 		code = BUS_MCEERR_AR;
 	}
 #endif
-	force_sig_info_fault(SIGBUS, code, address, tsk);
+	force_sig_info_fault(SIGBUS, code, address, tsk, fault);
 }

 static noinline void

@@ -824,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	if (fault & VM_FAULT_OOM) {
 		out_of_memory(regs, error_code, address);
 	} else {
-		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
+		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+			     VM_FAULT_HWPOISON_LARGE))
 			do_sigbus(regs, error_code, address, fault);
 		else
 			BUG();
......
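With the x86 change above, a SIGBUS raised for a poisoned mapping now carries the mapping granularity in si_addr_lsb: PAGE_SHIFT for a small page, or the hugepage shift recovered via hstate_index_to_shift() for a large one. A minimal userspace sketch of how a handler might consume that field, assuming a libc whose siginfo_t already exposes si_addr_lsb; the handler name and output format are illustrative only:

#define _GNU_SOURCE
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static void sigbus_handler(int sig, siginfo_t *si, void *ucontext)
{
	(void)sig;
	(void)ucontext;
	if (si->si_code == BUS_MCEERR_AR || si->si_code == BUS_MCEERR_AO) {
		/*
		 * si_addr_lsb is the least significant bit of the faulting
		 * address: 12 for a 4kB page, 21 for a 2MB hugepage, and so on.
		 * (fprintf is not async-signal-safe; it is used only to keep
		 * the example short.)
		 */
		uintptr_t mask = ~(((uintptr_t)1 << si->si_addr_lsb) - 1);
		fprintf(stderr, "memory failure at %p, poisoned block starts at %#lx (2^%d bytes)\n",
			si->si_addr,
			(unsigned long)((uintptr_t)si->si_addr & mask),
			si->si_addr_lsb);
	}
	_exit(1);	/* not safe to continue after an action-required error */
}

int main(void)
{
	struct sigaction sa = { 0 };

	sa.sa_sigaction = sigbus_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);

	pause();	/* real code would touch the poisoned mapping here */
	return 0;
}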
@@ -31,6 +31,7 @@
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/magic.h>
+#include <linux/migrate.h>

 #include <asm/uaccess.h>

@@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
 	return 0;
 }

+static int hugetlbfs_migrate_page(struct address_space *mapping,
+				struct page *newpage, struct page *page)
+{
+	int rc;
+
+	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+	if (rc)
+		return rc;
+	migrate_page_copy(newpage, page);
+
+	return 0;
+}
+
 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);

@@ -659,6 +673,7 @@ static const struct address_space_operations hugetlbfs_aops = {
 	.write_begin	= hugetlbfs_write_begin,
 	.write_end	= hugetlbfs_write_end,
 	.set_page_dirty	= hugetlbfs_set_page_dirty,
+	.migratepage	= hugetlbfs_migrate_page,
 };
......
@@ -98,6 +98,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
 		err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr);
 #ifdef __ARCH_SI_TRAPNO
 		err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
+#endif
+#ifdef BUS_MCEERR_AO
+		/*
+		 * Other callers might not initialize the si_lsb field,
+		 * so check explicitly for the right codes here.
+		 */
+		if (kinfo->si_code == BUS_MCEERR_AR ||
+		    kinfo->si_code == BUS_MCEERR_AO)
+			err |= __put_user((short) kinfo->si_addr_lsb,
+					  &uinfo->ssi_addr_lsb);
 #endif
 		break;
 	case __SI_CHLD:
......
@@ -43,7 +43,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
 						int acctflags);
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
-void __isolate_hwpoisoned_huge_page(struct page *page);
+int dequeue_hwpoisoned_huge_page(struct page *page);
+void copy_huge_page(struct page *dst, struct page *src);

 extern unsigned long hugepages_treat_as_movable;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;

@@ -101,7 +102,10 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
 #define hugetlb_fault(mm, vma, addr, flags)	({ BUG(); 0; })
 #define huge_pte_offset(mm, address)	0
-#define __isolate_hwpoisoned_huge_page(page)	0
+#define dequeue_hwpoisoned_huge_page(page)	0
+static inline void copy_huge_page(struct page *dst, struct page *src)
+{
+}

 #define hugetlb_change_protection(vma, address, end, newprot)

@@ -228,6 +232,8 @@ struct huge_bootmem_page {
 	struct hstate *hstate;
 };

+struct page *alloc_huge_page_node(struct hstate *h, int nid);
+
 /* arch callback */
 int __init alloc_bootmem_huge_page(struct hstate *h);

@@ -301,8 +307,14 @@ static inline struct hstate *page_hstate(struct page *page)
 	return size_to_hstate(PAGE_SIZE << compound_order(page));
 }

+static inline unsigned hstate_index_to_shift(unsigned index)
+{
+	return hstates[index].order + PAGE_SHIFT;
+}
+
 #else
 struct hstate {};
+#define alloc_huge_page_node(h, nid) NULL
 #define alloc_bootmem_huge_page(h) NULL
 #define hstate_file(f) NULL
 #define hstate_vma(v) NULL

@@ -317,6 +329,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h)
 {
 	return 1;
 }
+#define hstate_index_to_shift(index) 0
 #endif

 #endif /* _LINUX_HUGETLB_H */
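hstate_index_to_shift() simply adds the hstate's page order to PAGE_SHIFT. As a worked example on x86_64 (PAGE_SHIFT = 12): the 2MB hstate has order 9, so the helper returns 21; the 1GB hstate has order 18, giving 30. With !CONFIG_HUGETLB_PAGE the stub returns 0, which the x86 fault path above never consults because VM_FAULT_HWPOISON_LARGE cannot be raised in that configuration.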
@@ -14,6 +14,8 @@ extern int migrate_page(struct address_space *,
 			struct page *, struct page *);
 extern int migrate_pages(struct list_head *l, new_page_t x,
 			unsigned long private, int offlining);
+extern int migrate_huge_pages(struct list_head *l, new_page_t x,
+			unsigned long private, int offlining);

 extern int fail_migrate_page(struct address_space *,
 			struct page *, struct page *);

@@ -23,12 +25,17 @@ extern int migrate_prep_local(void);
 extern int migrate_vmas(struct mm_struct *mm,
 		const nodemask_t *from, const nodemask_t *to,
 		unsigned long flags);
+extern void migrate_page_copy(struct page *newpage, struct page *page);
+extern int migrate_huge_page_move_mapping(struct address_space *mapping,
+				  struct page *newpage, struct page *page);
 #else
 #define PAGE_MIGRATION 0

 static inline void putback_lru_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t x,
 		unsigned long private, int offlining) { return -ENOSYS; }
+static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
+		unsigned long private, int offlining) { return -ENOSYS; }

 static inline int migrate_prep(void) { return -ENOSYS; }
 static inline int migrate_prep_local(void) { return -ENOSYS; }

@@ -40,6 +47,15 @@ static inline int migrate_vmas(struct mm_struct *mm,
 	return -ENOSYS;
 }

+static inline void migrate_page_copy(struct page *newpage,
+				     struct page *page) {}
+
+static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
+				  struct page *newpage, struct page *page)
+{
+	return -ENOSYS;
+}
+
 /* Possible settings for the migrate_page() method in address_operations */
 #define migrate_page NULL
 #define fail_migrate_page NULL
......
@@ -718,12 +718,20 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_SIGBUS	0x0002
 #define VM_FAULT_MAJOR	0x0004
 #define VM_FAULT_WRITE	0x0008	/* Special case for get_user_pages */
-#define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned page */
+#define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned small page */
+#define VM_FAULT_HWPOISON_LARGE 0x0020	/* Hit poisoned large page. Index encoded in upper bits */

 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */

-#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
+#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
+
+#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
+			 VM_FAULT_HWPOISON_LARGE)
+
+/* Encode hstate index for a hwpoisoned large page */
+#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
+#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)

 /*
  * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
......
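The fault path encodes which hstate was hit in bits 12-15 of the VM_FAULT_* return value, so the arch handler can recover the page size without touching the struct page. A small standalone illustration of the round trip; the constants mirror the definitions above, and only the example hstate index (1) is made up:

#include <assert.h>

#define VM_FAULT_HWPOISON_LARGE 0x0020
#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)

int main(void)
{
	/* A hugetlb fault path would report a poisoned 2MB page roughly like
	 * this, assuming hstate index 1 corresponds to the 2MB hstate: */
	unsigned int fault = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(1);

	/* The arch fault handler recovers the index to look up the shift. */
	assert(fault & VM_FAULT_HWPOISON_LARGE);
	assert(VM_FAULT_GET_HINDEX(fault) == 1);
	return 0;
}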
@@ -33,6 +33,7 @@ struct signalfd_siginfo {
 	__u64 ssi_utime;
 	__u64 ssi_stime;
 	__u64 ssi_addr;
+	__u16 ssi_addr_lsb;

 	/*
 	 * Pad strcture to 128 bytes. Remember to update the

@@ -43,7 +44,7 @@ struct signalfd_siginfo {
 	 * comes out of a read(2) and we really don't want to have
 	 * a compat on read(2).
 	 */
-	__u8 __pad[48];
+	__u8 __pad[46];
 };
......
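The new __u16 shrinks the reserved padding from 48 to 46 bytes, so struct signalfd_siginfo stays at exactly 128 bytes as the comment requires. A hedged userspace sketch of reading the field through signalfd(2), assuming a libc whose <sys/signalfd.h> already carries the updated layout; in practice this mainly covers the asynchronous BUS_MCEERR_AO notifications, since an action-required SIGBUS is forced onto the faulting thread rather than left queued:

#define _GNU_SOURCE
#include <sys/signalfd.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct signalfd_siginfo ssi;
	sigset_t mask;
	int sfd;

	sigemptyset(&mask);
	sigaddset(&mask, SIGBUS);
	sigprocmask(SIG_BLOCK, &mask, NULL);	/* deliver SIGBUS via the fd */

	sfd = signalfd(-1, &mask, 0);
	if (sfd < 0)
		return 1;

	if (read(sfd, &ssi, sizeof(ssi)) == sizeof(ssi) &&
	    (ssi.ssi_code == BUS_MCEERR_AR || ssi.ssi_code == BUS_MCEERR_AO))
		printf("hwpoison signal: addr=%#llx lsb=%u\n",
		       (unsigned long long)ssi.ssi_addr, ssi.ssi_addr_lsb);

	close(sfd);
	return 0;
}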
(two collapsed file diffs not shown)
@@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			if (ret & VM_FAULT_OOM)
 				return i ? i : -ENOMEM;
 			if (ret &
-			    (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
+			    (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
+			     VM_FAULT_SIGBUS))
 				return i ? i : -EFAULT;
 			BUG();
 		}
......
@@ -32,6 +32,7 @@
 #include <linux/security.h>
 #include <linux/memcontrol.h>
 #include <linux/syscalls.h>
+#include <linux/hugetlb.h>
 #include <linux/gfp.h>

 #include "internal.h"

@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 	pte_t *ptep, pte;
 	spinlock_t *ptl;

-	pgd = pgd_offset(mm, addr);
-	if (!pgd_present(*pgd))
-		goto out;
+	if (unlikely(PageHuge(new))) {
+		ptep = huge_pte_offset(mm, addr);
+		if (!ptep)
+			goto out;
+		ptl = &mm->page_table_lock;
+	} else {
+		pgd = pgd_offset(mm, addr);
+		if (!pgd_present(*pgd))
+			goto out;

-	pud = pud_offset(pgd, addr);
-	if (!pud_present(*pud))
-		goto out;
+		pud = pud_offset(pgd, addr);
+		if (!pud_present(*pud))
+			goto out;

-	pmd = pmd_offset(pud, addr);
-	if (!pmd_present(*pmd))
-		goto out;
+		pmd = pmd_offset(pud, addr);
+		if (!pmd_present(*pmd))
+			goto out;

-	ptep = pte_offset_map(pmd, addr);
+		ptep = pte_offset_map(pmd, addr);

-	if (!is_swap_pte(*ptep)) {
-		pte_unmap(ptep);
-		goto out;
-	}
+		if (!is_swap_pte(*ptep)) {
+			pte_unmap(ptep);
+			goto out;
+		}
+
+		ptl = pte_lockptr(mm, pmd);
+	}

-	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	pte = *ptep;
 	if (!is_swap_pte(pte))

@@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 	if (is_write_migration_entry(entry))
 		pte = pte_mkwrite(pte);
+#ifdef CONFIG_HUGETLB_PAGE
+	if (PageHuge(new))
+		pte = pte_mkhuge(pte);
+#endif
 	flush_cache_page(vma, addr, pte_pfn(pte));
 	set_pte_at(mm, addr, ptep, pte);

-	if (PageAnon(new))
+	if (PageHuge(new)) {
+		if (PageAnon(new))
+			hugepage_add_anon_rmap(new, vma, addr);
+		else
+			page_dup_rmap(new);
+	} else if (PageAnon(new))
 		page_add_anon_rmap(new, vma, addr);
 	else
 		page_add_file_rmap(new);

@@ -275,12 +293,60 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	return 0;
 }

+/*
+ * The expected number of remaining references is the same as that
+ * of migrate_page_move_mapping().
+ */
+int migrate_huge_page_move_mapping(struct address_space *mapping,
+				   struct page *newpage, struct page *page)
+{
+	int expected_count;
+	void **pslot;
+
+	if (!mapping) {
+		if (page_count(page) != 1)
+			return -EAGAIN;
+		return 0;
+	}
+
+	spin_lock_irq(&mapping->tree_lock);
+
+	pslot = radix_tree_lookup_slot(&mapping->page_tree,
+					page_index(page));
+
+	expected_count = 2 + page_has_private(page);
+	if (page_count(page) != expected_count ||
+	    (struct page *)radix_tree_deref_slot(pslot) != page) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	if (!page_freeze_refs(page, expected_count)) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	get_page(newpage);
+
+	radix_tree_replace_slot(pslot, newpage);
+
+	page_unfreeze_refs(page, expected_count);
+
+	__put_page(page);
+
+	spin_unlock_irq(&mapping->tree_lock);
+	return 0;
+}
+
 /*
  * Copy the page to its new location
  */
-static void migrate_page_copy(struct page *newpage, struct page *page)
+void migrate_page_copy(struct page *newpage, struct page *page)
 {
-	copy_highpage(newpage, page);
+	if (PageHuge(page))
+		copy_huge_page(newpage, page);
+	else
+		copy_highpage(newpage, page);

 	if (PageError(page))
 		SetPageError(newpage);

@@ -723,6 +789,92 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	return rc;
 }

+/*
+ * Counterpart of unmap_and_move_page() for hugepage migration.
+ *
+ * This function doesn't wait the completion of hugepage I/O
+ * because there is no race between I/O and migration for hugepage.
+ * Note that currently hugepage I/O occurs only in direct I/O
+ * where no lock is held and PG_writeback is irrelevant,
+ * and writeback status of all subpages are counted in the reference
+ * count of the head page (i.e. if all subpages of a 2MB hugepage are
+ * under direct I/O, the reference of the head page is 512 and a bit more.)
+ * This means that when we try to migrate hugepage whose subpages are
+ * doing direct I/O, some references remain after try_to_unmap() and
+ * hugepage migration fails without data corruption.
+ *
+ * There is also no race when direct I/O is issued on the page under migration,
+ * because then pte is replaced with migration swap entry and direct I/O code
+ * will wait in the page fault for migration to complete.
+ */
+static int unmap_and_move_huge_page(new_page_t get_new_page,
+				unsigned long private, struct page *hpage,
+				int force, int offlining)
+{
+	int rc = 0;
+	int *result = NULL;
+	struct page *new_hpage = get_new_page(hpage, private, &result);
+	int rcu_locked = 0;
+	struct anon_vma *anon_vma = NULL;
+
+	if (!new_hpage)
+		return -ENOMEM;
+
+	rc = -EAGAIN;
+
+	if (!trylock_page(hpage)) {
+		if (!force)
+			goto out;
+		lock_page(hpage);
+	}
+
+	if (PageAnon(hpage)) {
+		rcu_read_lock();
+		rcu_locked = 1;
+
+		if (page_mapped(hpage)) {
+			anon_vma = page_anon_vma(hpage);
+			atomic_inc(&anon_vma->external_refcount);
+		}
+	}
+
+	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+
+	if (!page_mapped(hpage))
+		rc = move_to_new_page(new_hpage, hpage, 1);
+
+	if (rc)
+		remove_migration_ptes(hpage, hpage);
+
+	if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
+					    &anon_vma->lock)) {
+		int empty = list_empty(&anon_vma->head);
+		spin_unlock(&anon_vma->lock);
+		if (empty)
+			anon_vma_free(anon_vma);
+	}
+
+	if (rcu_locked)
+		rcu_read_unlock();
+out:
+	unlock_page(hpage);
+
+	if (rc != -EAGAIN) {
+		list_del(&hpage->lru);
+		put_page(hpage);
+	}
+
+	put_page(new_hpage);
+
+	if (result) {
+		if (rc)
+			*result = rc;
+		else
+			*result = page_to_nid(new_hpage);
+	}
+	return rc;
+}
+
 /*
  * migrate_pages
  *

@@ -788,6 +940,52 @@ int migrate_pages(struct list_head *from,
 	return nr_failed + retry;
 }

+int migrate_huge_pages(struct list_head *from,
+		new_page_t get_new_page, unsigned long private, int offlining)
+{
+	int retry = 1;
+	int nr_failed = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int rc;
+
+	for (pass = 0; pass < 10 && retry; pass++) {
+		retry = 0;
+
+		list_for_each_entry_safe(page, page2, from, lru) {
+			cond_resched();
+
+			rc = unmap_and_move_huge_page(get_new_page,
+					private, page, pass > 2, offlining);
+
+			switch(rc) {
+			case -ENOMEM:
+				goto out;
+			case -EAGAIN:
+				retry++;
+				break;
+			case 0:
+				break;
+			default:
+				/* Permanent failure */
+				nr_failed++;
+				break;
+			}
+		}
+	}
+	rc = 0;
+out:
+	list_for_each_entry_safe(page, page2, from, lru)
+		put_page(page);
+
+	if (rc)
+		return rc;
+
+	return nr_failed + retry;
+}
+
 #ifdef CONFIG_NUMA
 /*
  * Move a list of individual pages
......
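For context, a rough sketch of how a caller (for instance the soft-offline path elsewhere in this series) might drive the new entry point. The helper names and node-selection policy here are illustrative only, not lifted from the actual patches:

/*
 * Hypothetical caller of migrate_huge_pages(); assumes the caller already
 * holds a reference on hpage, which becomes the list entry's reference.
 */
static struct page *new_hpage(struct page *page, unsigned long private,
			      int **result)
{
	int nid = (int)private;

	/* Allocate a fresh hugepage of the same size on the requested node. */
	return alloc_huge_page_node(page_hstate(compound_head(page)), nid);
}

static int migrate_one_hugepage(struct page *hpage, int nid)
{
	LIST_HEAD(pagelist);
	int ret;

	list_add(&hpage->lru, &pagelist);

	/* migrate_huge_pages() drops the reference on pages left on the list. */
	ret = migrate_huge_pages(&pagelist, new_hpage, nid, 0);
	if (ret)
		pr_info("hugepage migration failed: %d\n", ret);

	return ret;
}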
@@ -780,10 +780,10 @@ void page_move_anon_rmap(struct page *page,
 }

 /**
- * __page_set_anon_rmap - setup new anonymous rmap
- * @page:	the page to add the mapping to
- * @vma:	the vm area in which the mapping is added
- * @address:	the user virtual address mapped
+ * __page_set_anon_rmap - set up new anonymous rmap
+ * @page:	Page to add to rmap
+ * @vma:	VM area to add page to.
+ * @address:	User virtual address of the mapping
  * @exclusive:	the page is exclusively owned by the current process
  */
 static void __page_set_anon_rmap(struct page *page,

@@ -793,25 +793,16 @@ static void __page_set_anon_rmap(struct page *page,

 	BUG_ON(!anon_vma);

+	if (PageAnon(page))
+		return;
+
 	/*
 	 * If the page isn't exclusively mapped into this vma,
 	 * we must use the _oldest_ possible anon_vma for the
 	 * page mapping!
 	 */
-	if (!exclusive) {
-		if (PageAnon(page))
-			return;
+	if (!exclusive)
 		anon_vma = anon_vma->root;
-	} else {
-		/*
-		 * In this case, swapped-out-but-not-discarded swap-cache
-		 * is remapped. So, no need to update page->mapping here.
-		 * We convice anon_vma poitned by page->mapping is not obsolete
-		 * because vma->anon_vma is necessary to be a family of it.
-		 */
-		if (PageAnon(page))
-			return;
-	}

 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
 	page->mapping = (struct address_space *) anon_vma;
......