Commit db168263 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6

* 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6: (21 commits)
  HWPOISON: Enable error_remove_page on btrfs
  HWPOISON: Add simple debugfs interface to inject hwpoison on arbitary PFNs
  HWPOISON: Add madvise() based injector for hardware poisoned pages v4
  HWPOISON: Enable error_remove_page for NFS
  HWPOISON: Enable .remove_error_page for migration aware file systems
  HWPOISON: The high level memory error handler in the VM v7
  HWPOISON: Add PR_MCE_KILL prctl to control early kill behaviour per process
  HWPOISON: shmem: call set_page_dirty() with locked page
  HWPOISON: Define a new error_remove_page address space op for async truncation
  HWPOISON: Add invalidate_inode_page
  HWPOISON: Refactor truncate to allow direct truncating of page v2
  HWPOISON: check and isolate corrupted free pages v2
  HWPOISON: Handle hardware poisoned pages in try_to_unmap
  HWPOISON: Use bitmask/action code for try_to_unmap behaviour
  HWPOISON: x86: Add VM_FAULT_HWPOISON handling to x86 page fault handler v2
  HWPOISON: Add poison check to page fault handling
  HWPOISON: Add basic support for poisoned pages in fault handler v3
  HWPOISON: Add new SIGBUS error codes for hardware poison signals
  HWPOISON: Add support for poison swap entries v2
  HWPOISON: Export some rmap vma locking to outside world
  ...
parents cd604513 465fdd97
......@@ -536,6 +536,7 @@ struct address_space_operations {
/* migrate the contents of a page to the specified target */
int (*migratepage) (struct page *, struct page *);
int (*launder_page) (struct page *);
int (*error_remove_page) (struct mapping *mapping, struct page *page);
};
writepage: called by the VM to write a dirty page to backing store.
......@@ -694,6 +695,12 @@ struct address_space_operations {
prevent redirtying the page, it is kept locked during the whole
operation.
error_remove_page: normally set to generic_error_remove_page if truncation
is ok for this address space. Used for memory failure handling.
Setting this implies you deal with pages going away under you,
unless you have them locked or reference counts increased.
The File Object
===============
......
......@@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/vm:
- legacy_va_layout
- lowmem_reserve_ratio
- max_map_count
- memory_failure_early_kill
- memory_failure_recovery
- min_free_kbytes
- min_slab_ratio
- min_unmapped_ratio
......@@ -53,7 +55,6 @@ Currently, these files are in /proc/sys/vm:
- vfs_cache_pressure
- zone_reclaim_mode
==============================================================
block_dump
......@@ -275,6 +276,44 @@ e.g., up to one or two maps per allocation.
The default value is 65536.
=============================================================
memory_failure_early_kill:
Control how to kill processes when uncorrected memory error (typically
a 2bit error in a memory module) is detected in the background by hardware
that cannot be handled by the kernel. In some cases (like the page
still having a valid copy on disk) the kernel will handle the failure
transparently without affecting any applications. But if there is
no other uptodate copy of the data it will kill to prevent any data
corruptions from propagating.
1: Kill all processes that have the corrupted and not reloadable page mapped
as soon as the corruption is detected. Note this is not supported
for a few types of pages, like kernel internally allocated data or
the swap cache, but works for the majority of user pages.
0: Only unmap the corrupted page from all processes and only kill a process
who tries to access it.
The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
handle this if they want to.
This is only active on architectures/platforms with advanced machine
check handling and depends on the hardware capabilities.
Applications can override this setting individually with the PR_MCE_KILL prctl
==============================================================
memory_failure_recovery
Enable memory failure recovery (when supported by the platform)
1: Attempt recovery.
0: Always panic on a memory failure.
==============================================================
min_free_kbytes:
......
......@@ -167,6 +167,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
info.si_errno = 0;
info.si_code = si_code;
info.si_addr = (void __user *)address;
info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
force_sig_info(si_signo, &info, tsk);
}
......@@ -790,10 +791,12 @@ out_of_memory(struct pt_regs *regs, unsigned long error_code,
}
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
unsigned int fault)
{
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
int code = BUS_ADRERR;
up_read(&mm->mmap_sem);
......@@ -809,7 +812,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
tsk->thread.error_code = error_code;
tsk->thread.trap_no = 14;
force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
#ifdef CONFIG_MEMORY_FAILURE
if (fault & VM_FAULT_HWPOISON) {
printk(KERN_ERR
"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
tsk->comm, tsk->pid, address);
code = BUS_MCEERR_AR;
}
#endif
force_sig_info_fault(SIGBUS, code, address, tsk);
}
static noinline void
......@@ -819,8 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
if (fault & VM_FAULT_OOM) {
out_of_memory(regs, error_code, address);
} else {
if (fault & VM_FAULT_SIGBUS)
do_sigbus(regs, error_code, address);
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
do_sigbus(regs, error_code, address, fault);
else
BUG();
}
......
......@@ -5269,6 +5269,7 @@ static const struct address_space_operations btrfs_aops = {
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
.set_page_dirty = btrfs_set_page_dirty,
.error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations btrfs_symlink_aops = {
......
......@@ -819,6 +819,7 @@ const struct address_space_operations ext2_aops = {
.writepages = ext2_writepages,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
const struct address_space_operations ext2_aops_xip = {
......@@ -837,6 +838,7 @@ const struct address_space_operations ext2_nobh_aops = {
.direct_IO = ext2_direct_IO,
.writepages = ext2_writepages,
.migratepage = buffer_migrate_page,
.error_remove_page = generic_error_remove_page,
};
/*
......
......@@ -1830,6 +1830,7 @@ static const struct address_space_operations ext3_ordered_aops = {
.direct_IO = ext3_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext3_writeback_aops = {
......@@ -1845,6 +1846,7 @@ static const struct address_space_operations ext3_writeback_aops = {
.direct_IO = ext3_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext3_journalled_aops = {
......@@ -1859,6 +1861,7 @@ static const struct address_space_operations ext3_journalled_aops = {
.invalidatepage = ext3_invalidatepage,
.releasepage = ext3_releasepage,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
void ext3_set_aops(struct inode *inode)
......
......@@ -3386,6 +3386,7 @@ static const struct address_space_operations ext4_ordered_aops = {
.direct_IO = ext4_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext4_writeback_aops = {
......@@ -3401,6 +3402,7 @@ static const struct address_space_operations ext4_writeback_aops = {
.direct_IO = ext4_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext4_journalled_aops = {
......@@ -3415,6 +3417,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations ext4_da_aops = {
......@@ -3431,6 +3434,7 @@ static const struct address_space_operations ext4_da_aops = {
.direct_IO = ext4_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
void ext4_set_aops(struct inode *inode)
......
......@@ -1135,6 +1135,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
.direct_IO = gfs2_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations gfs2_ordered_aops = {
......@@ -1151,6 +1152,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
.direct_IO = gfs2_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
static const struct address_space_operations gfs2_jdata_aops = {
......@@ -1166,6 +1168,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
.invalidatepage = gfs2_invalidatepage,
.releasepage = gfs2_releasepage,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
void gfs2_set_aops(struct inode *inode)
......
......@@ -525,6 +525,7 @@ const struct address_space_operations nfs_file_aops = {
.direct_IO = nfs_direct_IO,
.migratepage = nfs_migrate_page,
.launder_page = nfs_launder_page,
.error_remove_page = generic_error_remove_page,
};
/*
......
......@@ -1550,6 +1550,7 @@ const struct address_space_operations ntfs_aops = {
.migratepage = buffer_migrate_page, /* Move a page cache page from
one physical page to an
other. */
.error_remove_page = generic_error_remove_page,
};
/**
......@@ -1569,6 +1570,7 @@ const struct address_space_operations ntfs_mst_aops = {
.migratepage = buffer_migrate_page, /* Move a page cache page from
one physical page to an
other. */
.error_remove_page = generic_error_remove_page,
};
#ifdef NTFS_RW
......
......@@ -2022,4 +2022,5 @@ const struct address_space_operations ocfs2_aops = {
.releasepage = ocfs2_releasepage,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
......@@ -97,7 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
"Committed_AS: %8lu kB\n"
"VmallocTotal: %8lu kB\n"
"VmallocUsed: %8lu kB\n"
"VmallocChunk: %8lu kB\n",
"VmallocChunk: %8lu kB\n"
#ifdef CONFIG_MEMORY_FAILURE
"HardwareCorrupted: %8lu kB\n"
#endif
,
K(i.totalram),
K(i.freeram),
K(i.bufferram),
......@@ -144,6 +148,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
(unsigned long)VMALLOC_TOTAL >> 10,
vmi.used >> 10,
vmi.largest_chunk >> 10
#ifdef CONFIG_MEMORY_FAILURE
,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
#endif
);
hugetlb_report_meminfo(m);
......
......@@ -1635,4 +1635,5 @@ const struct address_space_operations xfs_address_space_operations = {
.direct_IO = xfs_vm_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
......@@ -34,6 +34,7 @@
#define MADV_REMOVE 9 /* remove these pages & resources */
#define MADV_DONTFORK 10 /* don't inherit across fork */
#define MADV_DOFORK 11 /* do inherit across fork */
#define MADV_HWPOISON 100 /* poison a page for testing */
#define MADV_MERGEABLE 12 /* KSM may merge identical pages */
#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
......
......@@ -82,6 +82,7 @@ typedef struct siginfo {
#ifdef __ARCH_SI_TRAPNO
int _trapno; /* TRAP # which caused the signal */
#endif
short _addr_lsb; /* LSB of the reported address */
} _sigfault;
/* SIGPOLL */
......@@ -112,6 +113,7 @@ typedef struct siginfo {
#ifdef __ARCH_SI_TRAPNO
#define si_trapno _sifields._sigfault._trapno
#endif
#define si_addr_lsb _sifields._sigfault._addr_lsb
#define si_band _sifields._sigpoll._band
#define si_fd _sifields._sigpoll._fd
......@@ -192,7 +194,11 @@ typedef struct siginfo {
#define BUS_ADRALN (__SI_FAULT|1) /* invalid address alignment */
#define BUS_ADRERR (__SI_FAULT|2) /* non-existant physical address */
#define BUS_OBJERR (__SI_FAULT|3) /* object specific hardware error */
#define NSIGBUS 3
/* hardware memory error consumed on a machine check: action required */
#define BUS_MCEERR_AR (__SI_FAULT|4)
/* hardware memory error detected in process but not consumed: action optional*/
#define BUS_MCEERR_AO (__SI_FAULT|5)
#define NSIGBUS 5
/*
* SIGTRAP si_codes
......
......@@ -595,6 +595,7 @@ struct address_space_operations {
int (*launder_page) (struct page *);
int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
unsigned long);
int (*error_remove_page)(struct address_space *, struct page *);
};
/*
......
......@@ -695,11 +695,12 @@ static inline int page_mapped(struct page *page)
#define VM_FAULT_SIGBUS 0x0002
#define VM_FAULT_MAJOR 0x0004
#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned page */
#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS)
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
/*
* Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
......@@ -794,6 +795,11 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
extern int vmtruncate(struct inode * inode, loff_t offset);
extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
int truncate_inode_page(struct address_space *mapping, struct page *page);
int generic_error_remove_page(struct address_space *mapping, struct page *page);
int invalidate_inode_page(struct page *page);
#ifdef CONFIG_MMU
extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags);
......@@ -1308,5 +1314,12 @@ void vmemmap_populate_print_last(void);
extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
size_t size);
extern void refund_locked_memory(struct mm_struct *mm, size_t size);
extern void memory_failure(unsigned long pfn, int trapno);
extern int __memory_failure(unsigned long pfn, int trapno, int ref);
extern int sysctl_memory_failure_early_kill;
extern int sysctl_memory_failure_recovery;
extern atomic_long_t mce_bad_pages;
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
......@@ -51,6 +51,9 @@
* PG_buddy is set to indicate that the page is free and in the buddy system
* (see mm/page_alloc.c).
*
* PG_hwpoison indicates that a page got corrupted in hardware and contains
* data with incorrect ECC bits that triggered a machine check. Accessing is
* not safe since it may cause another machine check. Don't touch!
*/
/*
......@@ -101,6 +104,9 @@ enum pageflags {
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
PG_uncached, /* Page has been mapped as uncached */
#endif
#ifdef CONFIG_MEMORY_FAILURE
PG_hwpoison, /* hardware poisoned page. Don't touch */
#endif
__NR_PAGEFLAGS,
......@@ -269,6 +275,15 @@ PAGEFLAG(Uncached, uncached)
PAGEFLAG_FALSE(Uncached)
#endif
#ifdef CONFIG_MEMORY_FAILURE
PAGEFLAG(HWPoison, hwpoison)
TESTSETFLAG(HWPoison, hwpoison)
#define __PG_HWPOISON (1UL << PG_hwpoison)
#else
PAGEFLAG_FALSE(HWPoison)
#define __PG_HWPOISON 0
#endif
static inline int PageUptodate(struct page *page)
{
int ret = test_bit(PG_uptodate, &(page)->flags);
......@@ -393,7 +408,7 @@ static inline void __ClearPageTail(struct page *page)
1 << PG_private | 1 << PG_private_2 | \
1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \
1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \
1 << PG_unevictable | __PG_MLOCKED)
1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON)
/*
* Flags checked when a page is prepped for return by the page allocator.
......
......@@ -88,4 +88,6 @@
#define PR_TASK_PERF_EVENTS_DISABLE 31
#define PR_TASK_PERF_EVENTS_ENABLE 32
#define PR_MCE_KILL 33
#endif /* _LINUX_PRCTL_H */
......@@ -81,7 +81,19 @@ static inline void page_dup_rmap(struct page *page)
*/
int page_referenced(struct page *, int is_locked,
struct mem_cgroup *cnt, unsigned long *vm_flags);
int try_to_unmap(struct page *, int ignore_refs);
enum ttu_flags {
TTU_UNMAP = 0, /* unmap mode */
TTU_MIGRATION = 1, /* migration mode */
TTU_MUNLOCK = 2, /* munlock mode */
TTU_ACTION_MASK = 0xff,
TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
};
#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
int try_to_unmap(struct page *, enum ttu_flags flags);
/*
* Called from mm/filemap_xip.c to unmap empty zero page
......@@ -108,6 +120,13 @@ int page_mkclean(struct page *);
*/
int try_to_munlock(struct page *);
/*
* Called by memory-failure.c to kill processes.
*/
struct anon_vma *page_lock_anon_vma(struct page *page);
void page_unlock_anon_vma(struct anon_vma *anon_vma);
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
#else /* !CONFIG_MMU */
#define anon_vma_init() do {} while (0)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment