Commit a36c14ba authored by Gerald Schaefer, committed by Sasha Levin

s390/mm: fix asce_bits handling with dynamic pagetable levels

[ Upstream commit 723cacbd ]

There is a race in multi-threaded applications between context switch and
pagetable upgrade. In switch_mm() a new user_asce is built from mm->pgd and
mm->context.asce_bits without holding any locks. A concurrent mmap with a
pagetable upgrade on another thread in crst_table_upgrade() could already
have set the new asce_bits, but not yet the new mm->pgd. This would result in
a corrupt user_asce in switch_mm(), and eventually in a kernel panic from a
translation exception.

Fix this by storing the complete asce instead of just the asce_bits, which
can then be read atomically from switch_mm(), so that it sees either the
old value or the new value, but never a mixture. Both cases are OK. Seeing the
old value would result in a page fault on access to the higher-level memory,
but the fault handler would see the new mm->pgd if it was a valid access
after the mmap on the other thread has completed. So in the worst case
we would have a page fault loop for the racing thread until the next time
slice.

Also remove dead code and simplify the upgrade/downgrade path: there are no
upgrades from 2 levels, and downgrades happen only from 3 levels, for compat
tasks. There are also no concurrent upgrades, because the mmap_sem is held
with down_write() in do_mmap, so the flush and table checks during upgrade
can be removed.
Reported-by: Michael Munday <munday@ca.ibm.com>
Reviewed-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
parent 90eb6718
...@@ -11,7 +11,7 @@ typedef struct { ...@@ -11,7 +11,7 @@ typedef struct {
spinlock_t list_lock; spinlock_t list_lock;
struct list_head pgtable_list; struct list_head pgtable_list;
struct list_head gmap_list; struct list_head gmap_list;
unsigned long asce_bits; unsigned long asce;
unsigned long asce_limit; unsigned long asce_limit;
unsigned long vdso_base; unsigned long vdso_base;
/* The mmu context allocates 4K page tables. */ /* The mmu context allocates 4K page tables. */
......
...@@ -26,12 +26,28 @@ static inline int init_new_context(struct task_struct *tsk, ...@@ -26,12 +26,28 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.has_pgste = 0; mm->context.has_pgste = 0;
mm->context.use_skey = 0; mm->context.use_skey = 0;
#endif #endif
if (mm->context.asce_limit == 0) { switch (mm->context.asce_limit) {
case 1UL << 42:
/*
* forked 3-level task, fall through to set new asce with new
* mm->pgd
*/
case 0:
/* context created by exec, set asce limit to 4TB */ /* context created by exec, set asce limit to 4TB */
mm->context.asce_bits = _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS | _ASCE_TYPE_REGION3;
mm->context.asce_limit = STACK_TOP_MAX; mm->context.asce_limit = STACK_TOP_MAX;
} else if (mm->context.asce_limit == (1UL << 31)) { mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS | _ASCE_TYPE_REGION3;
break;
case 1UL << 53:
/* forked 4-level task, set new asce with new mm->pgd */
mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS | _ASCE_TYPE_REGION2;
break;
case 1UL << 31:
/* forked 2-level compat task, set new asce with new mm->pgd */
mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
/* pgd_alloc() did not increase mm->nr_pmds */
mm_inc_nr_pmds(mm); mm_inc_nr_pmds(mm);
} }
crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
...@@ -42,7 +58,7 @@ static inline int init_new_context(struct task_struct *tsk, ...@@ -42,7 +58,7 @@ static inline int init_new_context(struct task_struct *tsk,
static inline void set_user_asce(struct mm_struct *mm) static inline void set_user_asce(struct mm_struct *mm)
{ {
S390_lowcore.user_asce = mm->context.asce_bits | __pa(mm->pgd); S390_lowcore.user_asce = mm->context.asce;
if (current->thread.mm_segment.ar4) if (current->thread.mm_segment.ar4)
__ctl_load(S390_lowcore.user_asce, 7, 7); __ctl_load(S390_lowcore.user_asce, 7, 7);
set_cpu_flag(CIF_ASCE); set_cpu_flag(CIF_ASCE);
...@@ -71,7 +87,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, ...@@ -71,7 +87,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
{ {
int cpu = smp_processor_id(); int cpu = smp_processor_id();
S390_lowcore.user_asce = next->context.asce_bits | __pa(next->pgd); S390_lowcore.user_asce = next->context.asce;
if (prev == next) if (prev == next)
return; return;
if (MACHINE_HAS_TLB_LC) if (MACHINE_HAS_TLB_LC)
......
...@@ -56,8 +56,8 @@ static inline unsigned long pgd_entry_type(struct mm_struct *mm) ...@@ -56,8 +56,8 @@ static inline unsigned long pgd_entry_type(struct mm_struct *mm)
return _REGION2_ENTRY_EMPTY; return _REGION2_ENTRY_EMPTY;
} }
int crst_table_upgrade(struct mm_struct *, unsigned long limit); int crst_table_upgrade(struct mm_struct *);
void crst_table_downgrade(struct mm_struct *, unsigned long limit); void crst_table_downgrade(struct mm_struct *);
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
{ {
......
...@@ -155,7 +155,7 @@ struct stack_frame { ...@@ -155,7 +155,7 @@ struct stack_frame {
regs->psw.mask = PSW_USER_BITS | PSW_MASK_BA; \ regs->psw.mask = PSW_USER_BITS | PSW_MASK_BA; \
regs->psw.addr = new_psw | PSW_ADDR_AMODE; \ regs->psw.addr = new_psw | PSW_ADDR_AMODE; \
regs->gprs[15] = new_stackp; \ regs->gprs[15] = new_stackp; \
crst_table_downgrade(current->mm, 1UL << 31); \ crst_table_downgrade(current->mm); \
execve_tail(); \ execve_tail(); \
} while (0) } while (0)
......
...@@ -110,8 +110,7 @@ static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce) ...@@ -110,8 +110,7 @@ static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce)
static inline void __tlb_flush_kernel(void) static inline void __tlb_flush_kernel(void)
{ {
if (MACHINE_HAS_IDTE) if (MACHINE_HAS_IDTE)
__tlb_flush_idte((unsigned long) init_mm.pgd | __tlb_flush_idte(init_mm.context.asce);
init_mm.context.asce_bits);
else else
__tlb_flush_global(); __tlb_flush_global();
} }
...@@ -133,8 +132,7 @@ static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce) ...@@ -133,8 +132,7 @@ static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce)
static inline void __tlb_flush_kernel(void) static inline void __tlb_flush_kernel(void)
{ {
if (MACHINE_HAS_TLB_LC) if (MACHINE_HAS_TLB_LC)
__tlb_flush_idte_local((unsigned long) init_mm.pgd | __tlb_flush_idte_local(init_mm.context.asce);
init_mm.context.asce_bits);
else else
__tlb_flush_local(); __tlb_flush_local();
} }
...@@ -148,8 +146,7 @@ static inline void __tlb_flush_mm(struct mm_struct * mm) ...@@ -148,8 +146,7 @@ static inline void __tlb_flush_mm(struct mm_struct * mm)
* only ran on the local cpu. * only ran on the local cpu.
*/ */
if (MACHINE_HAS_IDTE && list_empty(&mm->context.gmap_list)) if (MACHINE_HAS_IDTE && list_empty(&mm->context.gmap_list))
__tlb_flush_asce(mm, (unsigned long) mm->pgd | __tlb_flush_asce(mm, mm->context.asce);
mm->context.asce_bits);
else else
__tlb_flush_full(mm); __tlb_flush_full(mm);
} }
......
...@@ -112,7 +112,8 @@ void __init paging_init(void) ...@@ -112,7 +112,8 @@ void __init paging_init(void)
asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
pgd_type = _REGION3_ENTRY_EMPTY; pgd_type = _REGION3_ENTRY_EMPTY;
} }
S390_lowcore.kernel_asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits;
S390_lowcore.kernel_asce = init_mm.context.asce;
clear_table((unsigned long *) init_mm.pgd, pgd_type, clear_table((unsigned long *) init_mm.pgd, pgd_type,
sizeof(unsigned long)*2048); sizeof(unsigned long)*2048);
vmem_map_init(); vmem_map_init();
......
...@@ -184,7 +184,7 @@ int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags) ...@@ -184,7 +184,7 @@ int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags)
if (!(flags & MAP_FIXED)) if (!(flags & MAP_FIXED))
addr = 0; addr = 0;
if ((addr + len) >= TASK_SIZE) if ((addr + len) >= TASK_SIZE)
return crst_table_upgrade(current->mm, 1UL << 53); return crst_table_upgrade(current->mm);
return 0; return 0;
} }
...@@ -201,7 +201,7 @@ s390_get_unmapped_area(struct file *filp, unsigned long addr, ...@@ -201,7 +201,7 @@ s390_get_unmapped_area(struct file *filp, unsigned long addr,
return area; return area;
if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < (1UL << 53)) { if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < (1UL << 53)) {
/* Upgrade the page table to 4 levels and retry. */ /* Upgrade the page table to 4 levels and retry. */
rc = crst_table_upgrade(mm, 1UL << 53); rc = crst_table_upgrade(mm);
if (rc) if (rc)
return (unsigned long) rc; return (unsigned long) rc;
area = arch_get_unmapped_area(filp, addr, len, pgoff, flags); area = arch_get_unmapped_area(filp, addr, len, pgoff, flags);
...@@ -223,7 +223,7 @@ s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr, ...@@ -223,7 +223,7 @@ s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr,
return area; return area;
if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < (1UL << 53)) { if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < (1UL << 53)) {
/* Upgrade the page table to 4 levels and retry. */ /* Upgrade the page table to 4 levels and retry. */
rc = crst_table_upgrade(mm, 1UL << 53); rc = crst_table_upgrade(mm);
if (rc) if (rc)
return (unsigned long) rc; return (unsigned long) rc;
area = arch_get_unmapped_area_topdown(filp, addr, len, area = arch_get_unmapped_area_topdown(filp, addr, len,
......
...@@ -56,81 +56,52 @@ static void __crst_table_upgrade(void *arg) ...@@ -56,81 +56,52 @@ static void __crst_table_upgrade(void *arg)
__tlb_flush_local(); __tlb_flush_local();
} }
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit) int crst_table_upgrade(struct mm_struct *mm)
{ {
unsigned long *table, *pgd; unsigned long *table, *pgd;
unsigned long entry;
int flush;
BUG_ON(limit > (1UL << 53)); /* upgrade should only happen from 3 to 4 levels */
flush = 0; BUG_ON(mm->context.asce_limit != (1UL << 42));
repeat:
table = crst_table_alloc(mm); table = crst_table_alloc(mm);
if (!table) if (!table)
return -ENOMEM; return -ENOMEM;
spin_lock_bh(&mm->page_table_lock); spin_lock_bh(&mm->page_table_lock);
if (mm->context.asce_limit < limit) { pgd = (unsigned long *) mm->pgd;
pgd = (unsigned long *) mm->pgd; crst_table_init(table, _REGION2_ENTRY_EMPTY);
if (mm->context.asce_limit <= (1UL << 31)) { pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
entry = _REGION3_ENTRY_EMPTY; mm->pgd = (pgd_t *) table;
mm->context.asce_limit = 1UL << 42; mm->context.asce_limit = 1UL << 53;
mm->context.asce_bits = _ASCE_TABLE_LENGTH | mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS | _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
_ASCE_TYPE_REGION3; mm->task_size = mm->context.asce_limit;
} else {
entry = _REGION2_ENTRY_EMPTY;
mm->context.asce_limit = 1UL << 53;
mm->context.asce_bits = _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS |
_ASCE_TYPE_REGION2;
}
crst_table_init(table, entry);
pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
mm->pgd = (pgd_t *) table;
mm->task_size = mm->context.asce_limit;
table = NULL;
flush = 1;
}
spin_unlock_bh(&mm->page_table_lock); spin_unlock_bh(&mm->page_table_lock);
if (table)
crst_table_free(mm, table); on_each_cpu(__crst_table_upgrade, mm, 0);
if (mm->context.asce_limit < limit)
goto repeat;
if (flush)
on_each_cpu(__crst_table_upgrade, mm, 0);
return 0; return 0;
} }
void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) void crst_table_downgrade(struct mm_struct *mm)
{ {
pgd_t *pgd; pgd_t *pgd;
/* downgrade should only happen from 3 to 2 levels (compat only) */
BUG_ON(mm->context.asce_limit != (1UL << 42));
if (current->active_mm == mm) { if (current->active_mm == mm) {
clear_user_asce(); clear_user_asce();
__tlb_flush_mm(mm); __tlb_flush_mm(mm);
} }
while (mm->context.asce_limit > limit) {
pgd = mm->pgd; pgd = mm->pgd;
switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) { mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
case _REGION_ENTRY_TYPE_R2: mm->context.asce_limit = 1UL << 31;
mm->context.asce_limit = 1UL << 42; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
_ASCE_USER_BITS | mm->task_size = mm->context.asce_limit;
_ASCE_TYPE_REGION3; crst_table_free(mm, (unsigned long *) pgd);
break;
case _REGION_ENTRY_TYPE_R3:
mm->context.asce_limit = 1UL << 31;
mm->context.asce_bits = _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS |
_ASCE_TYPE_SEGMENT;
break;
default:
BUG();
}
mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
mm->task_size = mm->context.asce_limit;
crst_table_free(mm, (unsigned long *) pgd);
}
if (current->active_mm == mm) if (current->active_mm == mm)
set_user_asce(mm); set_user_asce(mm);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment