Commit 7d621599 authored by Linus Torvalds

Merge tag 'hyperv-next-signed-20221208' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux

Pull hyperv updates from Wei Liu:

 - Drop unregister syscore from hyperv_cleanup to avoid hang (Gaurav
   Kohli)

 - Clean up panic path for Hyper-V framebuffer (Guilherme G. Piccoli)

 - Allow IRQ remapping to work without x2apic (Nuno Das Neves)

 - Fix comments (Olaf Hering)

 - Expand hv_vp_assist_page definition (Saurabh Sengar)

 - Improvement to page reporting (Shradha Gupta)

 - Make sure TSC clocksource works when Linux runs as the root partition
   (Stanislav Kinsburskiy)

* tag 'hyperv-next-signed-20221208' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux:
  x86/hyperv: Remove unregister syscore call from Hyper-V cleanup
  iommu/hyper-v: Allow hyperv irq remapping without x2apic
  clocksource: hyper-v: Add TSC page support for root partition
  clocksource: hyper-v: Use TSC PFN getter to map vvar page
  clocksource: hyper-v: Introduce TSC PFN getter
  clocksource: hyper-v: Introduce a pointer to TSC page
  x86/hyperv: Expand definition of struct hv_vp_assist_page
  PCI: hv: update comment in x86 specific hv_arch_irq_unmask
  hv: fix comment typo in vmbus_channel/low_latency
  drivers: hv, hyperv_fb: Untangle and refactor Hyper-V panic notifiers
  video: hyperv_fb: Avoid taking busy spinlock on panic path
  hv_balloon: Add support for configurable order free page reporting
  mm/page_reporting: Add checks for page_reporting_order param
parents 43686598 32c97d98
@@ -210,11 +210,10 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
                    pgprot_decrypted(vma->vm_page_prot));
        }
    } else if (sym_offset == image->sym_hvclock_page) {
-       struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();
-       if (tsc_pg && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK))
-           return vmf_insert_pfn(vma, vmf->address,
-                   virt_to_phys(tsc_pg) >> PAGE_SHIFT);
+       pfn = hv_get_tsc_pfn();
+       if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK))
+           return vmf_insert_pfn(vma, vmf->address, pfn);
    } else if (sym_offset == image->sym_timens_page) {
        struct page *timens_page = find_timens_vvar_page(vma);
......
@@ -462,6 +462,8 @@ void __init hyperv_init(void)
        BUG_ON(!src);
        memcpy_to_page(pg, 0, src, HV_HYP_PAGE_SIZE);
        memunmap(src);
+       hv_remap_tsc_clocksource();
    } else {
        hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
        wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
@@ -535,8 +537,6 @@ void hyperv_cleanup(void)
    union hv_x64_msr_hypercall_contents hypercall_msr;
    union hv_reference_tsc_msr tsc_msr;
-   unregister_syscore_ops(&hv_syscore_ops);
    /* Reset our OS id */
    wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
    hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
......
@@ -374,11 +374,20 @@ struct hv_nested_enlightenments_control {
struct hv_vp_assist_page {
    __u32 apic_assist;
    __u32 reserved1;
-   __u64 vtl_control[3];
+   __u32 vtl_entry_reason;
+   __u32 vtl_reserved;
+   __u64 vtl_ret_x64rax;
+   __u64 vtl_ret_x64rcx;
    struct hv_nested_enlightenments_control nested_control;
    __u8 enlighten_vmentry;
    __u8 reserved2[7];
    __u64 current_nested_vmcs;
+   __u8 synthetic_time_unhalted_timer_expired;
+   __u8 reserved3[7];
+   __u8 virtualization_fault_information[40];
+   __u8 reserved4[8];
+   __u8 intercept_message[256];
+   __u8 vtl_ret_actions[256];
} __packed;

struct hv_enlightened_vmcs {
......
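For reference, the byte layout of the expanded assist page can be checked with a small host-side mirror of the structure. This is only an illustrative sketch, not kernel code, and it assumes struct hv_nested_enlightenments_control occupies 8 bytes (two 32-bit control words); the mirror type names are made up for the example.

    /* Standalone sketch: mirror of the expanded hv_vp_assist_page layout.
     * Assumption: the nested-enlightenments control block is 8 bytes wide.
     */
    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    struct nested_ctl_mirror {          /* assumed 8-byte placeholder */
        uint32_t features;
        uint32_t hypercall_controls;
    } __attribute__((packed));

    struct vp_assist_page_mirror {
        uint32_t apic_assist;
        uint32_t reserved1;
        uint32_t vtl_entry_reason;
        uint32_t vtl_reserved;
        uint64_t vtl_ret_x64rax;
        uint64_t vtl_ret_x64rcx;
        struct nested_ctl_mirror nested_control;
        uint8_t  enlighten_vmentry;
        uint8_t  reserved2[7];
        uint64_t current_nested_vmcs;
        uint8_t  synthetic_time_unhalted_timer_expired;
        uint8_t  reserved3[7];
        uint8_t  virtualization_fault_information[40];
        uint8_t  reserved4[8];
        uint8_t  intercept_message[256];
        uint8_t  vtl_ret_actions[256];
    } __attribute__((packed));

    int main(void)
    {
        /* Print a few byte offsets to visualize where the new fields land. */
        printf("vtl_entry_reason    @ %zu\n",
               offsetof(struct vp_assist_page_mirror, vtl_entry_reason));
        printf("current_nested_vmcs @ %zu\n",
               offsetof(struct vp_assist_page_mirror, current_nested_vmcs));
        printf("intercept_message   @ %zu\n",
               offsetof(struct vp_assist_page_mirror, intercept_message));
        printf("total size          = %zu bytes (fits in one 4 KiB page)\n",
               sizeof(struct vp_assist_page_mirror));
        return 0;
    }

Under the stated assumption the mirror is 624 bytes, comfortably inside the single 4 KiB assist page.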
@@ -475,6 +475,12 @@ static bool __init ms_hyperv_x2apic_available(void)
 * (logically) generates MSIs directly to the system APIC irq domain.
 * There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the
 * pci-hyperv host bridge.
+ *
+ * Note: for a Hyper-V root partition, this will always return false.
+ * The hypervisor doesn't expose these HYPERV_CPUID_VIRT_STACK_* cpuids by
+ * default, they are implemented as intercepts by the Windows Hyper-V stack.
+ * Even a nested root partition (L2 root) will not get them because the
+ * nested (L1) hypervisor filters them out.
 */
static bool __init ms_hyperv_msi_ext_dest_id(void)
{
......
@@ -367,9 +367,18 @@ static union {
    u8 reserved[PAGE_SIZE];
} tsc_pg __aligned(PAGE_SIZE);

+static struct ms_hyperv_tsc_page *tsc_page = &tsc_pg.page;
+static unsigned long tsc_pfn;
+
+unsigned long hv_get_tsc_pfn(void)
+{
+   return tsc_pfn;
+}
+EXPORT_SYMBOL_GPL(hv_get_tsc_pfn);
+
struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
{
-   return &tsc_pg.page;
+   return tsc_page;
}
EXPORT_SYMBOL_GPL(hv_get_tsc_page);
@@ -407,13 +416,12 @@ static void suspend_hv_clock_tsc(struct clocksource *arg)
static void resume_hv_clock_tsc(struct clocksource *arg)
{
-   phys_addr_t phys_addr = virt_to_phys(&tsc_pg);
    union hv_reference_tsc_msr tsc_msr;

    /* Re-enable the TSC page */
    tsc_msr.as_uint64 = hv_get_register(HV_REGISTER_REFERENCE_TSC);
    tsc_msr.enable = 1;
-   tsc_msr.pfn = HVPFN_DOWN(phys_addr);
+   tsc_msr.pfn = tsc_pfn;
    hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr.as_uint64);
}
@@ -497,14 +505,10 @@ static __always_inline void hv_setup_sched_clock(void *sched_clock) {}
static bool __init hv_init_tsc_clocksource(void)
{
    union hv_reference_tsc_msr tsc_msr;
-   phys_addr_t phys_addr;

    if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
        return false;
-   if (hv_root_partition)
-       return false;

    /*
     * If Hyper-V offers TSC_INVARIANT, then the virtualized TSC correctly
     * handles frequency and offset changes due to live migration,
@@ -522,18 +526,30 @@ static bool __init hv_init_tsc_clocksource(void)
    }

    hv_read_reference_counter = read_hv_clock_tsc;
-   phys_addr = virt_to_phys(hv_get_tsc_page());

    /*
-    * The Hyper-V TLFS specifies to preserve the value of reserved
-    * bits in registers. So read the existing value, preserve the
-    * low order 12 bits, and add in the guest physical address
-    * (which already has at least the low 12 bits set to zero since
-    * it is page aligned). Also set the "enable" bit, which is bit 0.
+    * TSC page mapping works differently in root compared to guest.
+    * - In guest partition the guest PFN has to be passed to the
+    *   hypervisor.
+    * - In root partition it's other way around: it has to map the PFN
+    *   provided by the hypervisor.
+    *   But it can't be mapped right here as it's too early and MMU isn't
+    *   ready yet. So, we only set the enable bit here and will remap the
+    *   page later in hv_remap_tsc_clocksource().
+    *
+    * It worth mentioning, that TSC clocksource read function
+    * (read_hv_clock_tsc) has a MSR-based fallback mechanism, used when
+    * TSC page is zeroed (which is the case until the PFN is remapped) and
+    * thus TSC clocksource will work even without the real TSC page
+    * mapped.
     */
    tsc_msr.as_uint64 = hv_get_register(HV_REGISTER_REFERENCE_TSC);
+   if (hv_root_partition)
+       tsc_pfn = tsc_msr.pfn;
+   else
+       tsc_pfn = HVPFN_DOWN(virt_to_phys(tsc_page));
    tsc_msr.enable = 1;
-   tsc_msr.pfn = HVPFN_DOWN(phys_addr);
+   tsc_msr.pfn = tsc_pfn;
    hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr.as_uint64);

    clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100);
@@ -566,3 +582,20 @@ void __init hv_init_clocksource(void)
    hv_sched_clock_offset = hv_read_reference_counter();
    hv_setup_sched_clock(read_hv_sched_clock_msr);
}
+
+void __init hv_remap_tsc_clocksource(void)
+{
+   if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
+       return;
+
+   if (!hv_root_partition) {
+       WARN(1, "%s: attempt to remap TSC page in guest partition\n",
+            __func__);
+       return;
+   }
+
+   tsc_page = memremap(tsc_pfn << HV_HYP_PAGE_SHIFT, sizeof(tsc_pg),
+                       MEMREMAP_WB);
+   if (!tsc_page)
+       pr_err("Failed to remap Hyper-V TSC page.\n");
+}
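The new comment notes that read_hv_clock_tsc() falls back to the reference-time MSR whenever the TSC page is still zeroed (which is the root partition's state until hv_remap_tsc_clocksource() runs). The following is a standalone sketch of that sequence-checked read with the MSR access stubbed out; every name here is an illustrative placeholder, only the algorithm mirrors the behaviour described above.

    /* Sketch of the Hyper-V reference-TSC page read with MSR fallback.
     * Sequence == 0 means the page is not valid yet, so the caller falls
     * back to reading the reference-time MSR instead.
     */
    #include <stdint.h>
    #include <stdio.h>

    struct tsc_page_mirror {
        volatile uint32_t tsc_sequence;
        uint32_t reserved1;
        volatile uint64_t tsc_scale;    /* 64.64 fixed-point multiplier */
        volatile int64_t  tsc_offset;
    };

    /* Stand-ins for rdtsc and the reference-time MSR read. */
    static uint64_t read_tsc(void)                 { return 123456789ULL; }
    static uint64_t read_time_ref_count_msr(void)  { return 0; }

    static uint64_t read_hv_reference_time(const struct tsc_page_mirror *pg)
    {
        uint32_t seq;
        uint64_t scale, tsc;
        int64_t offset;

        do {
            seq = pg->tsc_sequence;
            if (seq == 0)                          /* page not valid yet */
                return read_time_ref_count_msr();  /* MSR fallback */

            scale = pg->tsc_scale;
            offset = pg->tsc_offset;
            tsc = read_tsc();
        } while (pg->tsc_sequence != seq);         /* retry if updated */

        /* reference time = ((tsc * scale) >> 64) + offset, 100 ns units */
        return (uint64_t)(((unsigned __int128)tsc * scale) >> 64) + offset;
    }

    int main(void)
    {
        struct tsc_page_mirror pg = { 0 };         /* zeroed => fallback path */
        printf("%llu\n", (unsigned long long)read_hv_reference_time(&pg));
        return 0;
    }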
@@ -469,12 +469,16 @@ static bool do_hot_add;
 * the specified number of seconds.
 */
static uint pressure_report_delay = 45;
+extern unsigned int page_reporting_order;
+#define HV_MAX_FAILURES 2

/*
 * The last time we posted a pressure report to host.
 */
static unsigned long last_post_time;
+static int hv_hypercall_multi_failure;

module_param(hot_add, bool, (S_IRUGO | S_IWUSR));
MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add");
@@ -579,6 +583,10 @@ static struct hv_dynmem_device dm_device;
static void post_status(struct hv_dynmem_device *dm);
+static void enable_page_reporting(void);
+static void disable_page_reporting(void);

#ifdef CONFIG_MEMORY_HOTPLUG
static inline bool has_pfn_is_backed(struct hv_hotadd_state *has,
                     unsigned long pfn)
@@ -1418,6 +1426,18 @@ static int dm_thread_func(void *dm_dev)
         */
        reinit_completion(&dm_device.config_event);
        post_status(dm);
+       /*
+        * disable free page reporting if multiple hypercall
+        * failure flag set. It is not done in the page_reporting
+        * callback context as that causes a deadlock between
+        * page_reporting_process() and page_reporting_unregister()
+        */
+       if (hv_hypercall_multi_failure >= HV_MAX_FAILURES) {
+           pr_err("Multiple failures in cold memory discard hypercall, disabling page reporting\n");
+           disable_page_reporting();
+           /* Reset the flag after disabling reporting */
+           hv_hypercall_multi_failure = 0;
+       }
    }

    return 0;
@@ -1593,20 +1613,20 @@ static void balloon_onchannelcallback(void *context)
}

-/* Hyper-V only supports reporting 2MB pages or higher */
-#define HV_MIN_PAGE_REPORTING_ORDER    9
-#define HV_MIN_PAGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << HV_MIN_PAGE_REPORTING_ORDER)
+#define HV_LARGE_REPORTING_ORDER   9
+#define HV_LARGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << \
+       HV_LARGE_REPORTING_ORDER)

static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
               struct scatterlist *sgl, unsigned int nents)
{
    unsigned long flags;
    struct hv_memory_hint *hint;
-   int i;
+   int i, order;
    u64 status;
    struct scatterlist *sg;

    WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
-   WARN_ON_ONCE(sgl->length < HV_MIN_PAGE_REPORTING_LEN);
+   WARN_ON_ONCE(sgl->length < (HV_HYP_PAGE_SIZE << page_reporting_order));
    local_irq_save(flags);
    hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg);
    if (!hint) {
@@ -1621,21 +1641,53 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
        range = &hint->ranges[i];
        range->address_space = 0;
-       /* page reporting only reports 2MB pages or higher */
+       order = get_order(sg->length);
+       /*
+        * Hyper-V expects the additional_pages field in the units
+        * of one of these 3 sizes, 4Kbytes, 2Mbytes or 1Gbytes.
+        * This is dictated by the values of the fields page.largesize
+        * and page_size.
+        * This code however, only uses 4Kbytes and 2Mbytes units
+        * and not 1Gbytes unit.
+        */
+       /* page reporting for pages 2MB or higher */
+       if (order >= HV_LARGE_REPORTING_ORDER ) {
            range->page.largepage = 1;
-           range->page.additional_pages =
-               (sg->length / HV_MIN_PAGE_REPORTING_LEN) - 1;
            range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB;
-           range->base_large_pfn =
-               page_to_hvpfn(sg_page(sg)) >> HV_MIN_PAGE_REPORTING_ORDER;
+           range->base_large_pfn = page_to_hvpfn(
+                   sg_page(sg)) >> HV_LARGE_REPORTING_ORDER;
+           range->page.additional_pages =
+               (sg->length / HV_LARGE_REPORTING_LEN) - 1;
+       } else {
+           /* Page reporting for pages below 2MB */
+           range->page.basepfn = page_to_hvpfn(sg_page(sg));
+           range->page.largepage = false;
+           range->page.additional_pages =
+               (sg->length / HV_HYP_PAGE_SIZE) - 1;
+       }
    }

    status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0,
                     hint, NULL);
    local_irq_restore(flags);
-   if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+   if (!hv_result_success(status)) {
        pr_err("Cold memory discard hypercall failed with status %llx\n",
               status);
+       if (hv_hypercall_multi_failure > 0)
+           hv_hypercall_multi_failure++;
+       if (hv_result(status) == HV_STATUS_INVALID_PARAMETER) {
+           pr_err("Underlying Hyper-V does not support order less than 9. Hypercall failed\n");
+           pr_err("Defaulting to page_reporting_order %d\n",
+                  pageblock_order);
+           page_reporting_order = pageblock_order;
+           hv_hypercall_multi_failure++;
+           return -EINVAL;
+       }
        return -EINVAL;
    }
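As a worked example of the unit selection above, the following standalone sketch mirrors the 4 KiB / 2 MiB decision and the additional_pages arithmetic. The constants are copied from the hunk; the helper names and printed strings are illustrative, not part of the driver.

    /* Standalone sketch of the unit selection done by hv_free_page_report().
     * For an 8 MiB entry: order 11 >= 9, so the 2 MiB unit is used and
     * additional_pages = 8 MiB / 2 MiB - 1 = 3. For a 64 KiB entry the
     * 4 KiB unit is used and additional_pages = 16 - 1 = 15.
     */
    #include <stdio.h>

    #define HYP_PAGE_SHIFT          12          /* 4 KiB Hyper-V page */
    #define HYP_PAGE_SIZE           (1UL << HYP_PAGE_SHIFT)
    #define LARGE_REPORTING_ORDER   9           /* 2 MiB = 4 KiB << 9 */
    #define LARGE_REPORTING_LEN     (HYP_PAGE_SIZE << LARGE_REPORTING_ORDER)

    static void encode_range(unsigned long length_bytes)
    {
        /* get_order() equivalent: smallest order covering length_bytes */
        unsigned int order = 0;

        while ((HYP_PAGE_SIZE << order) < length_bytes)
            order++;

        if (order >= LARGE_REPORTING_ORDER)
            printf("%8lu bytes -> 2 MiB unit, additional_pages = %lu\n",
                   length_bytes, length_bytes / LARGE_REPORTING_LEN - 1);
        else
            printf("%8lu bytes -> 4 KiB unit, additional_pages = %lu\n",
                   length_bytes, length_bytes / HYP_PAGE_SIZE - 1);
    }

    int main(void)
    {
        encode_range(64 * 1024);          /* below 2 MiB   */
        encode_range(2 * 1024 * 1024);    /* exactly 2 MiB */
        encode_range(8 * 1024 * 1024);    /* multiple of 2 MiB */
        return 0;
    }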
@@ -1646,12 +1698,6 @@ static void enable_page_reporting(void)
{
    int ret;

-   /* Essentially, validating 'PAGE_REPORTING_MIN_ORDER' is big enough. */
-   if (pageblock_order < HV_MIN_PAGE_REPORTING_ORDER) {
-       pr_debug("Cold memory discard is only supported on 2MB pages and above\n");
-       return;
-   }
    if (!hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) {
        pr_debug("Cold memory discard hint not supported by Hyper-V\n");
        return;
@@ -1659,12 +1705,18 @@ static void enable_page_reporting(void)
    BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);

    dm_device.pr_dev_info.report = hv_free_page_report;
+   /*
+    * We let the page_reporting_order parameter decide the order
+    * in the page_reporting code
+    */
+   dm_device.pr_dev_info.order = 0;
    ret = page_reporting_register(&dm_device.pr_dev_info);
    if (ret < 0) {
        dm_device.pr_dev_info.report = NULL;
        pr_err("Failed to enable cold memory discard: %d\n", ret);
    } else {
-       pr_info("Cold memory discard hint enabled\n");
+       pr_info("Cold memory discard hint enabled with order %d\n",
+           page_reporting_order);
    }
}
......
@@ -280,6 +280,19 @@ void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info)
    ring_info->pkt_buffer_size = 0;
}

+/*
+ * Check if the ring buffer spinlock is available to take or not; used on
+ * atomic contexts, like panic path (see the Hyper-V framebuffer driver).
+ */
+bool hv_ringbuffer_spinlock_busy(struct vmbus_channel *channel)
+{
+   struct hv_ring_buffer_info *rinfo = &channel->outbound;
+
+   return spin_is_locked(&rinfo->ring_lock);
+}
+EXPORT_SYMBOL_GPL(hv_ringbuffer_spinlock_busy);
+
/* Write to the ring buffer. */
int hv_ringbuffer_write(struct vmbus_channel *channel,
            const struct kvec *kv_list, u32 kv_count,
......
@@ -25,7 +25,6 @@
#include <linux/sched/task_stack.h>
#include <linux/delay.h>
-#include <linux/notifier.h>
#include <linux/panic_notifier.h>
#include <linux/ptrace.h>
#include <linux/screen_info.h>
@@ -68,53 +67,74 @@ static int hyperv_report_reg(void)
    return !sysctl_record_panic_msg || !hv_panic_page;
}

-static int hyperv_panic_event(struct notifier_block *nb, unsigned long val,
+/*
+ * The panic notifier below is responsible solely for unloading the
+ * vmbus connection, which is necessary in a panic event.
+ *
+ * Notice an intrincate relation of this notifier with Hyper-V
+ * framebuffer panic notifier exists - we need vmbus connection alive
+ * there in order to succeed, so we need to order both with each other
+ * [see hvfb_on_panic()] - this is done using notifiers' priorities.
+ */
+static int hv_panic_vmbus_unload(struct notifier_block *nb, unsigned long val,
                 void *args)
{
-   struct pt_regs *regs;
    vmbus_initiate_unload(true);
-   /*
-    * Hyper-V should be notified only once about a panic. If we will be
-    * doing hv_kmsg_dump() with kmsg data later, don't do the notification
-    * here.
-    */
-   if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE
-       && hyperv_report_reg()) {
-       regs = current_pt_regs();
-       hyperv_report_panic(regs, val, false);
-   }
    return NOTIFY_DONE;
}
+static struct notifier_block hyperv_panic_vmbus_unload_block = {
+   .notifier_call = hv_panic_vmbus_unload,
+   .priority = INT_MIN + 1, /* almost the latest one to execute */
+};

-static int hyperv_die_event(struct notifier_block *nb, unsigned long val,
-               void *args)
+static int hv_die_panic_notify_crash(struct notifier_block *self,
+                    unsigned long val, void *args);
+
+static struct notifier_block hyperv_die_report_block = {
+   .notifier_call = hv_die_panic_notify_crash,
+};
+static struct notifier_block hyperv_panic_report_block = {
+   .notifier_call = hv_die_panic_notify_crash,
+};
+
+/*
+ * The following callback works both as die and panic notifier; its
+ * goal is to provide panic information to the hypervisor unless the
+ * kmsg dumper is used [see hv_kmsg_dump()], which provides more
+ * information but isn't always available.
+ *
+ * Notice that both the panic/die report notifiers are registered only
+ * if we have the capability HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE set.
+ */
+static int hv_die_panic_notify_crash(struct notifier_block *self,
+                    unsigned long val, void *args)
{
-   struct die_args *die = args;
-   struct pt_regs *regs = die->regs;
+   struct pt_regs *regs;
+   bool is_die;

-   /* Don't notify Hyper-V if the die event is other than oops */
-   if (val != DIE_OOPS)
-       return NOTIFY_DONE;
+   /* Don't notify Hyper-V unless we have a die oops event or panic. */
+   if (self == &hyperv_panic_report_block) {
+       is_die = false;
+       regs = current_pt_regs();
+   } else { /* die event */
+       if (val != DIE_OOPS)
+           return NOTIFY_DONE;
+
+       is_die = true;
+       regs = ((struct die_args *)args)->regs;
+   }

    /*
-    * Hyper-V should be notified only once about a panic. If we will be
-    * doing hv_kmsg_dump() with kmsg data later, don't do the notification
-    * here.
+    * Hyper-V should be notified only once about a panic/die. If we will
+    * be calling hv_kmsg_dump() later with kmsg data, don't do the
+    * notification here.
     */
    if (hyperv_report_reg())
-       hyperv_report_panic(regs, val, true);
+       hyperv_report_panic(regs, val, is_die);
    return NOTIFY_DONE;
}

-static struct notifier_block hyperv_die_block = {
-   .notifier_call = hyperv_die_event,
-};
-static struct notifier_block hyperv_panic_block = {
-   .notifier_call = hyperv_panic_event,
-};

static const char *fb_mmio_name = "fb_range";
static struct resource *fb_mmio;
static struct resource *hyperv_mmio;
@@ -1538,16 +1558,17 @@ static int vmbus_bus_init(void)
        if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG)
            hv_kmsg_dump_register();

-       register_die_notifier(&hyperv_die_block);
+       register_die_notifier(&hyperv_die_report_block);
+       atomic_notifier_chain_register(&panic_notifier_list,
+                          &hyperv_panic_report_block);
    }

    /*
-    * Always register the panic notifier because we need to unload
-    * the VMbus channel connection to prevent any VMbus
-    * activity after the VM panics.
+    * Always register the vmbus unload panic notifier because we
+    * need to shut the VMbus channel connection on panic.
     */
    atomic_notifier_chain_register(&panic_notifier_list,
-                      &hyperv_panic_block);
+                      &hyperv_panic_vmbus_unload_block);

    vmbus_request_offers();
@@ -2800,15 +2821,17 @@ static void __exit vmbus_exit(void)
    if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
        kmsg_dump_unregister(&hv_kmsg_dumper);
-       unregister_die_notifier(&hyperv_die_block);
+       unregister_die_notifier(&hyperv_die_report_block);
+       atomic_notifier_chain_unregister(&panic_notifier_list,
+                        &hyperv_panic_report_block);
    }

    /*
-    * The panic notifier is always registered, hence we should
+    * The vmbus panic notifier is always registered, hence we should
     * also unconditionally unregister it here as well.
     */
    atomic_notifier_chain_unregister(&panic_notifier_list,
-                    &hyperv_panic_block);
+                    &hyperv_panic_vmbus_unload_block);

    free_page((unsigned long)hv_panic_page);
    unregister_sysctl_table(hv_ctl_table_hdr);
......
@@ -474,13 +474,13 @@ config QCOM_IOMMU
      Support for IOMMU on certain Qualcomm SoCs.

config HYPERV_IOMMU
-   bool "Hyper-V x2APIC IRQ Handling"
+   bool "Hyper-V IRQ Handling"
    depends on HYPERV && X86
    select IOMMU_API
    default HYPERV
    help
-     Stub IOMMU driver to handle IRQs as to allow Hyper-V Linux
-     guests to run with x2APIC mode enabled.
+     Stub IOMMU driver to handle IRQs to support Hyper-V Linux
+     guest and root partitions.

config VIRTIO_IOMMU
    tristate "Virtio IOMMU driver"
......
@@ -122,9 +122,12 @@ static int __init hyperv_prepare_irq_remapping(void)
    const char *name;
    const struct irq_domain_ops *ops;

+   /*
+    * For a Hyper-V root partition, ms_hyperv_msi_ext_dest_id()
+    * will always return false.
+    */
    if (!hypervisor_is_type(X86_HYPER_MS_HYPERV) ||
-       x86_init.hyper.msi_ext_dest_id() ||
-       !x2apic_supported())
+       x86_init.hyper.msi_ext_dest_id())
        return -ENODEV;

    if (hv_root_partition) {
@@ -170,7 +173,9 @@ static int __init hyperv_prepare_irq_remapping(void)
static int __init hyperv_enable_irq_remapping(void)
{
+   if (x2apic_supported())
        return IRQ_REMAP_X2APIC_MODE;
+   return IRQ_REMAP_XAPIC_MODE;
}

struct irq_remap_ops hyperv_irq_remap_ops = {
......
@@ -735,9 +735,9 @@ static void hv_arch_irq_unmask(struct irq_data *data)
     * during hibernation does not matter (at this time all the devices
     * have been frozen). Note: the correct affinity info is still updated
     * into the irqdata data structure in migrate_one_irq() ->
-    * irq_do_set_affinity() -> hv_set_affinity(), so later when the VM
-    * resumes, hv_pci_restore_msi_state() is able to correctly restore
-    * the interrupt with the correct affinity.
+    * irq_do_set_affinity(), so later when the VM resumes,
+    * hv_pci_restore_msi_state() is able to correctly restore the
+    * interrupt with the correct affinity.
     */
    if (!hv_result_success(res) && hbus->state != hv_pcibus_removing)
        dev_err(&hbus->hdev->device,
......
@@ -780,12 +780,18 @@ static void hvfb_ondemand_refresh_throttle(struct hvfb_par *par,
static int hvfb_on_panic(struct notifier_block *nb,
             unsigned long e, void *p)
{
+   struct hv_device *hdev;
    struct hvfb_par *par;
    struct fb_info *info;

    par = container_of(nb, struct hvfb_par, hvfb_panic_nb);
-   par->synchronous_fb = true;
    info = par->info;
+   hdev = device_to_hv_device(info->device);
+   if (hv_ringbuffer_spinlock_busy(hdev->channel))
+       return NOTIFY_DONE;
+
+   par->synchronous_fb = true;
    if (par->need_docopy)
        hvfb_docopy(par, 0, dio_fb_size);
    synthvid_update(info, 0, 0, INT_MAX, INT_MAX);
@@ -1208,7 +1214,15 @@ static int hvfb_probe(struct hv_device *hdev,
    par->fb_ready = true;

    par->synchronous_fb = false;
+
+   /*
+    * We need to be sure this panic notifier runs _before_ the
+    * vmbus disconnect, so order it by priority. It must execute
+    * before the function hv_panic_vmbus_unload() [drivers/hv/vmbus_drv.c],
+    * which is almost at the end of list, with priority = INT_MIN + 1.
+    */
    par->hvfb_panic_nb.notifier_call = hvfb_on_panic;
+   par->hvfb_panic_nb.priority = INT_MIN + 10,
    atomic_notifier_chain_register(&panic_notifier_list,
                       &par->hvfb_panic_nb);
......
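The ordering the two comments rely on comes from the panic notifier chain invoking callbacks in descending priority order, so the framebuffer notifier (INT_MIN + 10) runs while the vmbus connection registered at INT_MIN + 1 is still alive. A standalone sketch of that ordering rule follows; the chain here is a plain sorted array, not the kernel's notifier implementation, and the entry names are illustrative.

    /* Standalone sketch: panic notifiers run in descending priority order,
     * so the framebuffer notifier (INT_MIN + 10) is called before the vmbus
     * unload notifier (INT_MIN + 1).
     */
    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct fake_notifier {
        const char *name;
        int priority;
    };

    static int by_descending_priority(const void *a, const void *b)
    {
        const struct fake_notifier *na = a, *nb = b;

        return (nb->priority > na->priority) - (nb->priority < na->priority);
    }

    int main(void)
    {
        struct fake_notifier chain[] = {
            { "hv_panic_vmbus_unload", INT_MIN + 1 },
            { "hvfb_on_panic",         INT_MIN + 10 },
            { "default priority",      0 },
        };

        qsort(chain, 3, sizeof(chain[0]), by_descending_priority);

        for (int i = 0; i < 3; i++)
            printf("%d: %s (priority %d)\n", i, chain[i].name,
                   chain[i].priority);
        return 0;
    }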
@@ -31,7 +31,9 @@ extern void hv_stimer_global_cleanup(void);
extern void hv_stimer0_isr(void);

extern void hv_init_clocksource(void);
+extern void hv_remap_tsc_clocksource(void);

+extern unsigned long hv_get_tsc_pfn(void);
extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void);

static inline notrace u64
@@ -90,6 +92,11 @@ hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
}

#else /* CONFIG_HYPERV_TIMER */
+static inline unsigned long hv_get_tsc_pfn(void)
+{
+   return 0;
+}
+
static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
{
    return NULL;
......
@@ -969,7 +969,7 @@ struct vmbus_channel {
     * mechanism improves throughput by:
     *
     * A) Making the host more efficient - each time it wakes up,
-    *    potentially it will process morev number of packets. The
+    *    potentially it will process more number of packets. The
     *    monitor latency allows a batch to build up.
     * B) By deferring the hypercall to signal, we will also minimize
     *    the interrupts.
@@ -1341,6 +1341,8 @@ struct hv_ring_buffer_debug_info {
int hv_ringbuffer_get_debuginfo(struct hv_ring_buffer_info *ring_info,
                struct hv_ring_buffer_debug_info *debug_info);

+bool hv_ringbuffer_spinlock_busy(struct vmbus_channel *channel);
+
/* Vmbus interface */
#define vmbus_driver_register(driver) \
    __vmbus_driver_register(driver, THIS_MODULE, KBUILD_MODNAME)
......
@@ -11,10 +11,42 @@
#include "page_reporting.h"
#include "internal.h"

-unsigned int page_reporting_order = MAX_ORDER;
-module_param(page_reporting_order, uint, 0644);
+/* Initialize to an unsupported value */
+unsigned int page_reporting_order = -1;
+
+static int page_order_update_notify(const char *val, const struct kernel_param *kp)
+{
+   /*
+    * If param is set beyond this limit, order is set to default
+    * pageblock_order value
+    */
+   return param_set_uint_minmax(val, kp, 0, MAX_ORDER-1);
+}
+
+static const struct kernel_param_ops page_reporting_param_ops = {
+   .set = &page_order_update_notify,
+   /*
+    * For the get op, use param_get_int instead of param_get_uint.
+    * This is to make sure that when unset the initialized value of
+    * -1 is shown correctly
+    */
+   .get = &param_get_int,
+};
+
+module_param_cb(page_reporting_order, &page_reporting_param_ops,
+           &page_reporting_order, 0644);
MODULE_PARM_DESC(page_reporting_order, "Set page reporting order");

+/*
+ * This symbol is also a kernel parameter. Export the page_reporting_order
+ * symbol so that other drivers can access it to control order values without
+ * having to introduce another configurable parameter. Only one driver can
+ * register with the page_reporting driver for the service, so we have just
+ * one control parameter for the use case(which can be accessed in both
+ * drivers)
+ */
+EXPORT_SYMBOL_GPL(page_reporting_order);
+
#define PAGE_REPORTING_DELAY (2 * HZ)
static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
@@ -330,10 +362,18 @@ int page_reporting_register(struct page_reporting_dev_info *prdev)
    }

    /*
-    * Update the page reporting order if it's specified by driver.
-    * Otherwise, it falls back to @pageblock_order.
+    * If the page_reporting_order value is not set, we check if
+    * an order is provided from the driver that is performing the
+    * registration. If that is not provided either, we default to
+    * pageblock_order.
     */
-   page_reporting_order = prdev->order ? : pageblock_order;
+   if (page_reporting_order == -1) {
+       if (prdev->order > 0 && prdev->order <= MAX_ORDER)
+           page_reporting_order = prdev->order;
+       else
+           page_reporting_order = pageblock_order;
+   }

    /* initialize state and work structures */
    atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
......
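Taken together, the hv_balloon and page_reporting changes give the reporting order a simple precedence: an explicitly set page_reporting_order parameter wins, then a valid order supplied by the registering driver, and finally pageblock_order. The sketch below restates that resolution as a standalone function; the MAX_ORDER and pageblock_order values are illustrative placeholders for x86-64, not definitions pulled from the kernel.

    /* Standalone sketch of the reporting-order resolution implied above:
     * a user-set page_reporting_order wins, otherwise a valid order passed
     * by the registering driver, otherwise pageblock_order.
     */
    #include <stdio.h>

    #define MAX_ORDER        11      /* illustrative */
    #define PAGEBLOCK_ORDER  9       /* illustrative (2 MiB pageblocks) */

    static unsigned int resolve_order(unsigned int param, unsigned int drv_order)
    {
        if (param != (unsigned int)-1)          /* set via module parameter */
            return param;
        if (drv_order > 0 && drv_order <= MAX_ORDER)
            return drv_order;                   /* driver-provided order */
        return PAGEBLOCK_ORDER;                 /* final fallback */
    }

    int main(void)
    {
        printf("param unset, driver 0 -> %u\n", resolve_order(-1, 0)); /* 9 */
        printf("param unset, driver 5 -> %u\n", resolve_order(-1, 5)); /* 5 */
        printf("param 0,     driver 5 -> %u\n", resolve_order(0, 5));  /* 0 */
        return 0;
    }

With hv_balloon setting prdev->order to 0, the effective default remains pageblock_order unless the administrator overrides the parameter.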