Commit 460df4c1 authored by Paolo Bonzini

KVM: race-free exit from KVM_RUN without POSIX signals

The purpose of the KVM_SET_SIGNAL_MASK API is to let userspace "kick"
a VCPU out of KVM_RUN through a POSIX signal.  A signal is attached
to a dummy signal handler; by blocking the signal outside KVM_RUN and
unblocking it inside, this possible race is closed:

          VCPU thread                     service thread
   --------------------------------------------------------------
        check flag
                                          set flag
                                          raise signal
        (signal handler does nothing)
        KVM_RUN
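
For reference, the userspace side of this scheme looks roughly like the sketch
below (the vcpu_fd parameter and the choice of SIGUSR1 as the kick signal are
assumptions, and error handling is omitted).  The signal stays blocked while
ordinary userspace code runs, and KVM_SET_SIGNAL_MASK makes KVM apply the
unblocked mask only for the duration of KVM_RUN, so a kick raised after the
"check flag" step still interrupts the ioctl with EINTR instead of being lost:

    #include <pthread.h>
    #include <signal.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static void dummy_handler(int sig) { }      /* delivery alone is enough */

    static void setup_signal_kick(int vcpu_fd)  /* vcpu_fd: an already open VCPU fd */
    {
            struct kvm_signal_mask *kmask;
            sigset_t blocked, unblocked;

            signal(SIGUSR1, dummy_handler);

            /* Block the kick signal for ordinary userspace execution... */
            sigemptyset(&blocked);
            sigaddset(&blocked, SIGUSR1);
            pthread_sigmask(SIG_BLOCK, &blocked, &unblocked);

            /* ...and have KVM unblock it only while inside KVM_RUN. */
            kmask = malloc(sizeof(*kmask) + 8);
            kmask->len = 8;                     /* kernel sigset is 64 bits */
            memcpy(kmask->sigset, &unblocked, 8);
            ioctl(vcpu_fd, KVM_SET_SIGNAL_MASK, kmask);
            free(kmask);
    }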

However, one issue with KVM_SET_SIGNAL_MASK is that it has to take
tsk->sighand->siglock on every KVM_RUN.  This lock is often on a
remote NUMA node, because it is on the node of a thread's creator.
Taking this lock can be very expensive if there are many userspace
exits (as is the case for SMP Windows VMs without Hyper-V reference
time counter).

As an alternative, we can put the flag directly in kvm_run so that
KVM can see it:

          VCPU thread                     service thread
   --------------------------------------------------------------
                                          raise signal
        signal handler
          set run->immediate_exit
        KVM_RUN
          check run->immediate_exit
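
A corresponding userspace sketch under the new scheme (the vcpu_fd, the SIGUSR1
kick signal, the kick_handler/vcpu_loop names, and the kvm_run mapping obtained
elsewhere via mmap of the VCPU fd are all illustrative assumptions) no longer
has to touch the signal mask at all; the handler simply stores into the shared
kvm_run structure, and KVM_RUN bails out with EINTR whether the signal lands
before or during the ioctl:

    #include <errno.h>
    #include <signal.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static struct kvm_run *run;         /* mmap() of the VCPU fd, set up elsewhere */

    static void kick_handler(int sig)
    {
            run->immediate_exit = 1;    /* polled once by KVM_RUN on entry */
    }

    static int vcpu_loop(int vcpu_fd)
    {
            signal(SIGUSR1, kick_handler);

            for (;;) {
                    run->immediate_exit = 0;
                    if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
                            if (errno == EINTR) {
                                    /* kicked: service the request, then resume */
                                    continue;
                            }
                            return -1;
                    }
                    /* handle run->exit_reason here */
            }
    }
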
Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent bbd64115
@@ -3389,7 +3389,18 @@ struct kvm_run {
 Request that KVM_RUN return when it becomes possible to inject external
 interrupts into the guest. Useful in conjunction with KVM_INTERRUPT.
 
-        __u8 padding1[7];
+        __u8 immediate_exit;
+
+This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN
+exits immediately, returning -EINTR. In the common scenario where a
+signal is used to "kick" a VCPU out of KVM_RUN, this field can be used
+to avoid usage of KVM_SET_SIGNAL_MASK, which has worse scalability.
+Rather than blocking the signal outside KVM_RUN, userspace can set up
+a signal handler that sets run->immediate_exit to a non-zero value.
+
+This field is ignored if KVM_CAP_IMMEDIATE_EXIT is not available.
+
+        __u8 padding1[6];
 
         /* out */
         __u32 exit_reason;
...
@@ -206,6 +206,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
         case KVM_CAP_ARM_PSCI_0_2:
         case KVM_CAP_READONLY_MEM:
         case KVM_CAP_MP_STATE:
+        case KVM_CAP_IMMEDIATE_EXIT:
                 r = 1;
                 break;
         case KVM_CAP_COALESCED_MMIO:
@@ -604,6 +605,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                 return ret;
         }
 
+        if (run->immediate_exit)
+                return -EINTR;
+
         if (vcpu->sigset_active)
                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
...
@@ -397,7 +397,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
-        int r = 0;
+        int r = -EINTR;
         sigset_t sigsaved;
 
         if (vcpu->sigset_active)
@@ -409,6 +409,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                 vcpu->mmio_needed = 0;
         }
 
+        if (run->immediate_exit)
+                goto out;
+
         lose_fpu(1);
 
         local_irq_disable();
@@ -429,6 +432,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
         guest_exit_irqoff();
         local_irq_enable();
 
+out:
         if (vcpu->sigset_active)
                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
@@ -1021,6 +1025,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
         case KVM_CAP_ENABLE_CAP:
         case KVM_CAP_READONLY_MEM:
         case KVM_CAP_SYNC_MMU:
+        case KVM_CAP_IMMEDIATE_EXIT:
                 r = 1;
                 break;
         case KVM_CAP_COALESCED_MMIO:
...
@@ -511,6 +511,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
         case KVM_CAP_ONE_REG:
         case KVM_CAP_IOEVENTFD:
         case KVM_CAP_DEVICE_CTRL:
+        case KVM_CAP_IMMEDIATE_EXIT:
                 r = 1;
                 break;
         case KVM_CAP_PPC_PAIRED_SINGLES:
@@ -1117,7 +1118,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 #endif
         }
 
-        r = kvmppc_vcpu_run(run, vcpu);
+        if (run->immediate_exit)
+                r = -EINTR;
+        else
+                r = kvmppc_vcpu_run(run, vcpu);
 
         if (vcpu->sigset_active)
                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
...
@@ -370,6 +370,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
         case KVM_CAP_S390_IRQCHIP:
         case KVM_CAP_VM_ATTRIBUTES:
         case KVM_CAP_MP_STATE:
+        case KVM_CAP_IMMEDIATE_EXIT:
         case KVM_CAP_S390_INJECT_IRQ:
         case KVM_CAP_S390_USER_SIGP:
         case KVM_CAP_S390_USER_STSI:
@@ -2798,6 +2799,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         int rc;
         sigset_t sigsaved;
 
+        if (kvm_run->immediate_exit)
+                return -EINTR;
+
         if (guestdbg_exit_pending(vcpu)) {
                 kvm_s390_prepare_debug_exit(vcpu);
                 return 0;
...
@@ -2672,6 +2672,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
         case KVM_CAP_DISABLE_QUIRKS:
         case KVM_CAP_SET_BOOT_CPU_ID:
         case KVM_CAP_SPLIT_IRQCHIP:
+        case KVM_CAP_IMMEDIATE_EXIT:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
         case KVM_CAP_ASSIGN_DEV_IRQ:
         case KVM_CAP_PCI_2_3:
@@ -7202,7 +7203,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         } else
                 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
-        r = vcpu_run(vcpu);
+        if (kvm_run->immediate_exit)
+                r = -EINTR;
+        else
+                r = vcpu_run(vcpu);
 
 out:
         post_kvm_run_save(vcpu);
...
@@ -218,7 +218,8 @@ struct kvm_hyperv_exit {
 struct kvm_run {
         /* in */
         __u8 request_interrupt_window;
-        __u8 padding1[7];
+        __u8 immediate_exit;
+        __u8 padding1[6];
 
         /* out */
         __u32 exit_reason;
@@ -881,6 +882,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_SPAPR_RESIZE_HPT 133
 #define KVM_CAP_PPC_MMU_RADIX 134
 #define KVM_CAP_PPC_MMU_HASH_V3 135
+#define KVM_CAP_IMMEDIATE_EXIT 136
 
 #ifdef KVM_CAP_IRQ_ROUTING
...