Commit f55f0501 authored by Andy Lutomirski's avatar Andy Lutomirski Committed by Ingo Molnar

x86/pti: Put the LDT in its own PGD if PTI is on

With PTI enabled, the LDT must be mapped in the usermode tables somewhere.
The LDT is per process, i.e. per mm.

An earlier approach mapped the LDT on context switch into a fixmap area,
but that's a big overhead and exhausted the fixmap space when NR_CPUS got
big.

Take advantage of the fact that there is an address space hole which
provides a completely unused pgd. Use this pgd to manage per-mm LDT
mappings.

This has a down side: the LDT isn't (currently) randomized, and an attack
that can write the LDT is instant root due to call gates (thanks, AMD, for
leaving call gates in AMD64 but designing them wrong so they're only useful
for exploits).  This can be mitigated by making the LDT read-only or
randomizing the mapping, either of which is strightforward on top of this
patch.

This will significantly slow down LDT users, but that shouldn't matter for
important workloads -- the LDT is only used by DOSEMU(2), Wine, and very
old libc implementations.

[ tglx: Cleaned it up. ]
Signed-off-by: default avatarAndy Lutomirski <luto@kernel.org>
Signed-off-by: default avatarThomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
parent 9f449772
...@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) ...@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ... ... unused hole ...
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
... unused hole ... ... unused hole ...
fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ... ... unused hole ...
...@@ -29,7 +30,7 @@ Virtual memory map with 5 level page tables: ...@@ -29,7 +30,7 @@ Virtual memory map with 5 level page tables:
hole caused by [56:63] sign extension hole caused by [56:63] sign extension
ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
ff90000000000000 - ff9fffffffffffff (=52 bits) hole ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB) ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
......
...@@ -50,10 +50,33 @@ struct ldt_struct { ...@@ -50,10 +50,33 @@ struct ldt_struct {
* call gates. On native, we could merge the ldt_struct and LDT * call gates. On native, we could merge the ldt_struct and LDT
* allocations, but it's not worth trying to optimize. * allocations, but it's not worth trying to optimize.
*/ */
struct desc_struct *entries; struct desc_struct *entries;
unsigned int nr_entries; unsigned int nr_entries;
/*
* If PTI is in use, then the entries array is not mapped while we're
* in user mode. The whole array will be aliased at the addressed
* given by ldt_slot_va(slot). We use two slots so that we can allocate
* and map, and enable a new LDT without invalidating the mapping
* of an older, still-in-use LDT.
*
* slot will be -1 if this LDT doesn't have an alias mapping.
*/
int slot;
}; };
/* This is a multiple of PAGE_SIZE. */
#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
static inline void *ldt_slot_va(int slot)
{
#ifdef CONFIG_X86_64
return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
#else
BUG();
#endif
}
/* /*
* Used for LDT copy/destruction. * Used for LDT copy/destruction.
*/ */
...@@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm) ...@@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
} }
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm); void destroy_context_ldt(struct mm_struct *mm);
void ldt_arch_exit_mmap(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */ #else /* CONFIG_MODIFY_LDT_SYSCALL */
static inline void init_new_context_ldt(struct mm_struct *mm) { } static inline void init_new_context_ldt(struct mm_struct *mm) { }
static inline int ldt_dup_context(struct mm_struct *oldmm, static inline int ldt_dup_context(struct mm_struct *oldmm,
...@@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm, ...@@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
{ {
return 0; return 0;
} }
static inline void destroy_context_ldt(struct mm_struct *mm) {} static inline void destroy_context_ldt(struct mm_struct *mm) { }
static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
#endif #endif
static inline void load_mm_ldt(struct mm_struct *mm) static inline void load_mm_ldt(struct mm_struct *mm)
...@@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm) ...@@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
* that we can see. * that we can see.
*/ */
if (unlikely(ldt)) if (unlikely(ldt)) {
set_ldt(ldt->entries, ldt->nr_entries); if (static_cpu_has(X86_FEATURE_PTI)) {
else if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
/*
* Whoops -- either the new LDT isn't mapped
* (if slot == -1) or is mapped into a bogus
* slot (if slot > 1).
*/
clear_LDT();
return;
}
/*
* If page table isolation is enabled, ldt->entries
* will not be mapped in the userspace pagetables.
* Tell the CPU to access the LDT through the alias
* at ldt_slot_va(ldt->slot).
*/
set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
} else {
set_ldt(ldt->entries, ldt->nr_entries);
}
} else {
clear_LDT(); clear_LDT();
}
#else #else
clear_LDT(); clear_LDT();
#endif #endif
...@@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) ...@@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
static inline void arch_exit_mmap(struct mm_struct *mm) static inline void arch_exit_mmap(struct mm_struct *mm)
{ {
paravirt_arch_exit_mmap(mm); paravirt_arch_exit_mmap(mm);
ldt_arch_exit_mmap(mm);
} }
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
......
...@@ -82,10 +82,14 @@ typedef struct { pteval_t pte; } pte_t; ...@@ -82,10 +82,14 @@ typedef struct { pteval_t pte; } pte_t;
# define VMALLOC_SIZE_TB _AC(12800, UL) # define VMALLOC_SIZE_TB _AC(12800, UL)
# define __VMALLOC_BASE _AC(0xffa0000000000000, UL) # define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
# define LDT_PGD_ENTRY _AC(-112, UL)
# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#else #else
# define VMALLOC_SIZE_TB _AC(32, UL) # define VMALLOC_SIZE_TB _AC(32, UL)
# define __VMALLOC_BASE _AC(0xffffc90000000000, UL) # define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
# define LDT_PGD_ENTRY _AC(-4, UL)
# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#endif #endif
#ifdef CONFIG_RANDOMIZE_MEMORY #ifdef CONFIG_RANDOMIZE_MEMORY
......
...@@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x) ...@@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x)
#else #else
/* /*
* User space process size. 47bits minus one guard page. The guard * User space process size. This is the first address outside the user range.
* page is necessary on Intel CPUs: if a SYSCALL instruction is at * There are a few constraints that determine this:
* the highest possible canonical userspace address, then that *
* syscall will enter the kernel with a non-canonical return * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
* address, and SYSRET will explode dangerously. We avoid this * address, then that syscall will enter the kernel with a
* particular problem by preventing anything from being mapped * non-canonical return address, and SYSRET will explode dangerously.
* at the maximum canonical address. * We avoid this particular problem by preventing anything executable
* from being mapped at the maximum canonical address.
*
* On AMD CPUs in the Ryzen family, there's a nasty bug in which the
* CPUs malfunction if they execute code from the highest canonical page.
* They'll speculate right off the end of the canonical space, and
* bad things happen. This is worked around in the same way as the
* Intel problem.
*
* With page table isolation enabled, we map the LDT in ... [stay tuned]
*/ */
#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) #define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <asm/ldt.h> #include <asm/ldt.h>
#include <asm/tlb.h>
#include <asm/desc.h> #include <asm/desc.h>
#include <asm/mmu_context.h> #include <asm/mmu_context.h>
#include <asm/syscalls.h> #include <asm/syscalls.h>
...@@ -51,13 +52,11 @@ static void refresh_ldt_segments(void) ...@@ -51,13 +52,11 @@ static void refresh_ldt_segments(void)
static void flush_ldt(void *__mm) static void flush_ldt(void *__mm)
{ {
struct mm_struct *mm = __mm; struct mm_struct *mm = __mm;
mm_context_t *pc;
if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
return; return;
pc = &mm->context; load_mm_ldt(mm);
set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
refresh_ldt_segments(); refresh_ldt_segments();
} }
...@@ -94,10 +93,121 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) ...@@ -94,10 +93,121 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
return NULL; return NULL;
} }
/* The new LDT isn't aliased for PTI yet. */
new_ldt->slot = -1;
new_ldt->nr_entries = num_entries; new_ldt->nr_entries = num_entries;
return new_ldt; return new_ldt;
} }
/*
* If PTI is enabled, this maps the LDT into the kernelmode and
* usermode tables for the given mm.
*
* There is no corresponding unmap function. Even if the LDT is freed, we
* leave the PTEs around until the slot is reused or the mm is destroyed.
* This is harmless: the LDT is always in ordinary memory, and no one will
* access the freed slot.
*
* If we wanted to unmap freed LDTs, we'd also need to do a flush to make
* it useful, and the flush would slow down modify_ldt().
*/
static int
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
bool is_vmalloc, had_top_level_entry;
unsigned long va;
spinlock_t *ptl;
pgd_t *pgd;
int i;
if (!static_cpu_has(X86_FEATURE_PTI))
return 0;
/*
* Any given ldt_struct should have map_ldt_struct() called at most
* once.
*/
WARN_ON(ldt->slot != -1);
/*
* Did we already have the top level entry allocated? We can't
* use pgd_none() for this because it doens't do anything on
* 4-level page table kernels.
*/
pgd = pgd_offset(mm, LDT_BASE_ADDR);
had_top_level_entry = (pgd->pgd != 0);
is_vmalloc = is_vmalloc_addr(ldt->entries);
for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
unsigned long offset = i << PAGE_SHIFT;
const void *src = (char *)ldt->entries + offset;
unsigned long pfn;
pte_t pte, *ptep;
va = (unsigned long)ldt_slot_va(slot) + offset;
pfn = is_vmalloc ? vmalloc_to_pfn(src) :
page_to_pfn(virt_to_page(src));
/*
* Treat the PTI LDT range as a *userspace* range.
* get_locked_pte() will allocate all needed pagetables
* and account for them in this mm.
*/
ptep = get_locked_pte(mm, va, &ptl);
if (!ptep)
return -ENOMEM;
pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL));
set_pte_at(mm, va, ptep, pte);
pte_unmap_unlock(ptep, ptl);
}
if (mm->context.ldt) {
/*
* We already had an LDT. The top-level entry should already
* have been allocated and synchronized with the usermode
* tables.
*/
WARN_ON(!had_top_level_entry);
if (static_cpu_has(X86_FEATURE_PTI))
WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
} else {
/*
* This is the first time we're mapping an LDT for this process.
* Sync the pgd to the usermode tables.
*/
WARN_ON(had_top_level_entry);
if (static_cpu_has(X86_FEATURE_PTI)) {
WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
set_pgd(kernel_to_user_pgdp(pgd), *pgd);
}
}
va = (unsigned long)ldt_slot_va(slot);
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
ldt->slot = slot;
#endif
return 0;
}
static void free_ldt_pgtables(struct mm_struct *mm)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
struct mmu_gather tlb;
unsigned long start = LDT_BASE_ADDR;
unsigned long end = start + (1UL << PGDIR_SHIFT);
if (!static_cpu_has(X86_FEATURE_PTI))
return;
tlb_gather_mmu(&tlb, mm, start, end);
free_pgd_range(&tlb, start, end, start, end);
tlb_finish_mmu(&tlb, start, end);
#endif
}
/* After calling this, the LDT is immutable. */ /* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt) static void finalize_ldt_struct(struct ldt_struct *ldt)
{ {
...@@ -156,6 +266,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) ...@@ -156,6 +266,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
new_ldt->nr_entries * LDT_ENTRY_SIZE); new_ldt->nr_entries * LDT_ENTRY_SIZE);
finalize_ldt_struct(new_ldt); finalize_ldt_struct(new_ldt);
retval = map_ldt_struct(mm, new_ldt, 0);
if (retval) {
free_ldt_pgtables(mm);
free_ldt_struct(new_ldt);
goto out_unlock;
}
mm->context.ldt = new_ldt; mm->context.ldt = new_ldt;
out_unlock: out_unlock:
...@@ -174,6 +290,11 @@ void destroy_context_ldt(struct mm_struct *mm) ...@@ -174,6 +290,11 @@ void destroy_context_ldt(struct mm_struct *mm)
mm->context.ldt = NULL; mm->context.ldt = NULL;
} }
void ldt_arch_exit_mmap(struct mm_struct *mm)
{
free_ldt_pgtables(mm);
}
static int read_ldt(void __user *ptr, unsigned long bytecount) static int read_ldt(void __user *ptr, unsigned long bytecount)
{ {
struct mm_struct *mm = current->mm; struct mm_struct *mm = current->mm;
...@@ -287,6 +408,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) ...@@ -287,6 +408,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
new_ldt->entries[ldt_info.entry_number] = ldt; new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt); finalize_ldt_struct(new_ldt);
/*
* If we are using PTI, map the new LDT into the userspace pagetables.
* If there is already an LDT, use the other slot so that other CPUs
* will continue to use the old LDT until install_ldt() switches
* them over to the new LDT.
*/
error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
if (error) {
free_ldt_struct(old_ldt);
goto out_unlock;
}
install_ldt(mm, new_ldt); install_ldt(mm, new_ldt);
free_ldt_struct(old_ldt); free_ldt_struct(old_ldt);
error = 0; error = 0;
......
...@@ -52,11 +52,17 @@ enum address_markers_idx { ...@@ -52,11 +52,17 @@ enum address_markers_idx {
USER_SPACE_NR = 0, USER_SPACE_NR = 0,
KERNEL_SPACE_NR, KERNEL_SPACE_NR,
LOW_KERNEL_NR, LOW_KERNEL_NR,
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
LDT_NR,
#endif
VMALLOC_START_NR, VMALLOC_START_NR,
VMEMMAP_START_NR, VMEMMAP_START_NR,
#ifdef CONFIG_KASAN #ifdef CONFIG_KASAN
KASAN_SHADOW_START_NR, KASAN_SHADOW_START_NR,
KASAN_SHADOW_END_NR, KASAN_SHADOW_END_NR,
#endif
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
LDT_NR,
#endif #endif
CPU_ENTRY_AREA_NR, CPU_ENTRY_AREA_NR,
#ifdef CONFIG_X86_ESPFIX64 #ifdef CONFIG_X86_ESPFIX64
...@@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = { ...@@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = {
#ifdef CONFIG_KASAN #ifdef CONFIG_KASAN
[KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
[KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
[LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
#endif #endif
[CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
#ifdef CONFIG_X86_ESPFIX64 #ifdef CONFIG_X86_ESPFIX64
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment