Commit a2b7861b authored by Boqun Feng's avatar Boqun Feng Committed by Radim Krčmář

kvm/x86: Avoid async PF preempting the kernel incorrectly

Currently, in PREEMPT_COUNT=n kernel, kvm_async_pf_task_wait() could call
schedule() to reschedule in some cases.  This could result in
accidentally ending the current RCU read-side critical section early,
causing random memory corruption in the guest, or otherwise preempting
the currently running task inside between preempt_disable and
preempt_enable.

The difficulty to handle this well is because we don't know whether an
async PF delivered in a preemptible section or RCU read-side critical section
for PREEMPT_COUNT=n, since preempt_disable()/enable() and rcu_read_lock/unlock()
are both no-ops in that case.

To cure this, we treat any async PF interrupting a kernel context as one
that cannot be preempted, preventing kvm_async_pf_task_wait() from choosing
the schedule() path in that case.

To do so, a second parameter for kvm_async_pf_task_wait() is introduced,
so that we know whether it's called from a context interrupting the
kernel, and the parameter is set properly in all the callsites.

Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: default avatarBoqun Feng <boqun.feng@gmail.com>
Signed-off-by: default avatarRadim Krčmář <rkrcmar@redhat.com>
parent 2fb1e946
...@@ -88,7 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, ...@@ -88,7 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
bool kvm_para_available(void); bool kvm_para_available(void);
unsigned int kvm_arch_para_features(void); unsigned int kvm_arch_para_features(void);
void __init kvm_guest_init(void); void __init kvm_guest_init(void);
void kvm_async_pf_task_wait(u32 token); void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
void kvm_async_pf_task_wake(u32 token); void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_pf_reason(void); u32 kvm_read_and_reset_pf_reason(void);
extern void kvm_disable_steal_time(void); extern void kvm_disable_steal_time(void);
...@@ -103,7 +103,7 @@ static inline void kvm_spinlock_init(void) ...@@ -103,7 +103,7 @@ static inline void kvm_spinlock_init(void)
#else /* CONFIG_KVM_GUEST */ #else /* CONFIG_KVM_GUEST */
#define kvm_guest_init() do {} while (0) #define kvm_guest_init() do {} while (0)
#define kvm_async_pf_task_wait(T) do {} while(0) #define kvm_async_pf_task_wait(T, I) do {} while(0)
#define kvm_async_pf_task_wake(T) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0)
static inline bool kvm_para_available(void) static inline bool kvm_para_available(void)
......
...@@ -117,7 +117,11 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, ...@@ -117,7 +117,11 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
return NULL; return NULL;
} }
void kvm_async_pf_task_wait(u32 token) /*
* @interrupt_kernel: Is this called from a routine which interrupts the kernel
* (other than user space)?
*/
void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
{ {
u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
...@@ -140,8 +144,10 @@ void kvm_async_pf_task_wait(u32 token) ...@@ -140,8 +144,10 @@ void kvm_async_pf_task_wait(u32 token)
n.token = token; n.token = token;
n.cpu = smp_processor_id(); n.cpu = smp_processor_id();
n.halted = is_idle_task(current) || preempt_count() > 1 || n.halted = is_idle_task(current) ||
rcu_preempt_depth(); (IS_ENABLED(CONFIG_PREEMPT_COUNT)
? preempt_count() > 1 || rcu_preempt_depth()
: interrupt_kernel);
init_swait_queue_head(&n.wq); init_swait_queue_head(&n.wq);
hlist_add_head(&n.link, &b->list); hlist_add_head(&n.link, &b->list);
raw_spin_unlock(&b->lock); raw_spin_unlock(&b->lock);
...@@ -269,7 +275,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) ...@@ -269,7 +275,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
case KVM_PV_REASON_PAGE_NOT_PRESENT: case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */ /* page is swapped out by the host. */
prev_state = exception_enter(); prev_state = exception_enter();
kvm_async_pf_task_wait((u32)read_cr2()); kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs));
exception_exit(prev_state); exception_exit(prev_state);
break; break;
case KVM_PV_REASON_PAGE_READY: case KVM_PV_REASON_PAGE_READY:
......
...@@ -3837,7 +3837,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, ...@@ -3837,7 +3837,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
case KVM_PV_REASON_PAGE_NOT_PRESENT: case KVM_PV_REASON_PAGE_NOT_PRESENT:
vcpu->arch.apf.host_apf_reason = 0; vcpu->arch.apf.host_apf_reason = 0;
local_irq_disable(); local_irq_disable();
kvm_async_pf_task_wait(fault_address); kvm_async_pf_task_wait(fault_address, 0);
local_irq_enable(); local_irq_enable();
break; break;
case KVM_PV_REASON_PAGE_READY: case KVM_PV_REASON_PAGE_READY:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment