Commit 17fae129, authored by Tony Luck, committed by Thomas Gleixner

x86/{mce,mm}: Unmap the entire page if the whole page is affected and poisoned

An interesting thing happened when a guest Linux instance took a machine
check. The VMM unmapped the bad page from guest physical space and
passed the machine check to the guest.

Linux took all the normal actions to offline the page from the process
that was using it. But then guest Linux crashed because it said there
was a second machine check inside the kernel with this stack trace:

do_memory_failure
    set_mce_nospec
         set_memory_uc
              _set_memory_uc
                   change_page_attr_set_clr
                        cpa_flush
                             clflush_cache_range_opt

This was odd, because a CLFLUSH instruction shouldn't raise a machine
check (it isn't consuming the data). Further investigation showed that
the VMM had passed in another machine check because it appeared that the
guest was accessing the bad page.

Fix is to check the scope of the poison by checking the MCi_MISC register.
If the entire page is affected, then unmap the page. If only part of the
page is affected, then mark the page as uncacheable.

This assumes that VMMs will do the logical thing and pass in the "whole
page scope" via the MCi_MISC register (since they unmapped the entire
page).

  [ bp: Adjust to x86/entry changes. ]

Fixes: 284ce401 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()")
Reported-by: Jue Wang <juew@google.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Jue Wang <juew@google.com>
Cc: <stable@vger.kernel.org>
Link: https://lkml.kernel.org/r/20200520163546.GA7977@agluck-desk2.amr.corp.intel.com

parent f77d26a9
...@@ -86,28 +86,35 @@ int set_direct_map_default_noflush(struct page *page); ...@@ -86,28 +86,35 @@ int set_direct_map_default_noflush(struct page *page);
extern int kernel_set_to_readonly; extern int kernel_set_to_readonly;
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
static inline int set_mce_nospec(unsigned long pfn) /*
* Prevent speculative access to the page by either unmapping
* it (if we do not require access to any part of the page) or
* marking it uncacheable (if we want to try to retrieve data
* from non-poisoned lines in the page).
*/
static inline int set_mce_nospec(unsigned long pfn, bool unmap)
{ {
unsigned long decoy_addr; unsigned long decoy_addr;
int rc; int rc;
/* /*
* Mark the linear address as UC to make sure we don't log more
* errors because of speculative access to the page.
* We would like to just call: * We would like to just call:
* set_memory_uc((unsigned long)pfn_to_kaddr(pfn), 1); * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
* but doing that would radically increase the odds of a * but doing that would radically increase the odds of a
* speculative access to the poison page because we'd have * speculative access to the poison page because we'd have
* the virtual address of the kernel 1:1 mapping sitting * the virtual address of the kernel 1:1 mapping sitting
* around in registers. * around in registers.
* Instead we get tricky. We create a non-canonical address * Instead we get tricky. We create a non-canonical address
* that looks just like the one we want, but has bit 63 flipped. * that looks just like the one we want, but has bit 63 flipped.
* This relies on set_memory_uc() properly sanitizing any __pa() * This relies on set_memory_XX() properly sanitizing any __pa()
* results with __PHYSICAL_MASK or PTE_PFN_MASK. * results with __PHYSICAL_MASK or PTE_PFN_MASK.
*/ */
decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
rc = set_memory_uc(decoy_addr, 1); if (unmap)
rc = set_memory_np(decoy_addr, 1);
else
rc = set_memory_uc(decoy_addr, 1);
if (rc) if (rc)
pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
return rc; return rc;
......
...@@ -520,6 +520,14 @@ bool mce_is_memory_error(struct mce *m) ...@@ -520,6 +520,14 @@ bool mce_is_memory_error(struct mce *m)
} }
EXPORT_SYMBOL_GPL(mce_is_memory_error); EXPORT_SYMBOL_GPL(mce_is_memory_error);
/*
 * Decide whether the poison reported by machine check record @m should be
 * treated as covering the entire page.
 *
 * Returns true when mca_cfg.ser is clear or MCI_STATUS_MISCV is not set in
 * m->status (presumably because MCi_MISC then carries no usable scope
 * information - confirm against the SDM). Otherwise returns whether the
 * MCI_MISC_ADDR_LSB() field of m->misc is at least PAGE_SHIFT.
 *
 * Callers pass the result to set_mce_nospec() as its "unmap" argument.
 */
static bool whole_page(struct mce *m)
{
if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
return true;
return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
}
bool mce_is_correctable(struct mce *m) bool mce_is_correctable(struct mce *m)
{ {
if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED) if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
...@@ -573,7 +581,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, ...@@ -573,7 +581,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
pfn = mce->addr >> PAGE_SHIFT; pfn = mce->addr >> PAGE_SHIFT;
if (!memory_failure(pfn, 0)) { if (!memory_failure(pfn, 0)) {
set_mce_nospec(pfn); set_mce_nospec(pfn, whole_page(mce));
mce->kflags |= MCE_HANDLED_UC; mce->kflags |= MCE_HANDLED_UC;
} }
...@@ -1173,11 +1181,12 @@ static void kill_me_maybe(struct callback_head *cb) ...@@ -1173,11 +1181,12 @@ static void kill_me_maybe(struct callback_head *cb)
int flags = MF_ACTION_REQUIRED; int flags = MF_ACTION_REQUIRED;
pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
if (!(p->mce_status & MCG_STATUS_RIPV))
if (!p->mce_ripv)
flags |= MF_MUST_KILL; flags |= MF_MUST_KILL;
if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) { if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) {
set_mce_nospec(p->mce_addr >> PAGE_SHIFT); set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
return; return;
} }
...@@ -1331,7 +1340,8 @@ void noinstr do_machine_check(struct pt_regs *regs) ...@@ -1331,7 +1340,8 @@ void noinstr do_machine_check(struct pt_regs *regs)
BUG_ON(!on_thread_stack() || !user_mode(regs)); BUG_ON(!on_thread_stack() || !user_mode(regs));
current->mce_addr = m.addr; current->mce_addr = m.addr;
current->mce_status = m.mcgstatus; current->mce_ripv = !!(m.mcgstatus & MCG_STATUS_RIPV);
current->mce_whole_page = whole_page(&m);
current->mce_kill_me.func = kill_me_maybe; current->mce_kill_me.func = kill_me_maybe;
if (kill_it) if (kill_it)
current->mce_kill_me.func = kill_me_now; current->mce_kill_me.func = kill_me_now;
......
...@@ -1304,7 +1304,9 @@ struct task_struct { ...@@ -1304,7 +1304,9 @@ struct task_struct {
#ifdef CONFIG_X86_MCE #ifdef CONFIG_X86_MCE
u64 mce_addr; u64 mce_addr;
u64 mce_status; __u64 mce_ripv : 1,
mce_whole_page : 1,
__mce_reserved : 62;
struct callback_head mce_kill_me; struct callback_head mce_kill_me;
#endif #endif
......
...@@ -26,7 +26,7 @@ static inline int set_direct_map_default_noflush(struct page *page) ...@@ -26,7 +26,7 @@ static inline int set_direct_map_default_noflush(struct page *page)
#endif #endif
#ifndef set_mce_nospec #ifndef set_mce_nospec
static inline int set_mce_nospec(unsigned long pfn) static inline int set_mce_nospec(unsigned long pfn, bool unmap)
{ {
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment