Commit 081f359e authored by Linus Torvalds

Merge tag 'hyperv-fixes-signed-20221125' of...

Merge tag 'hyperv-fixes-signed-20221125' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux

Pull hyperv fixes from Wei Liu:

 - Fix IRTE allocation in Hyper-V PCI controller (Dexuan Cui)

 - Fix handling of SCSI srb_status and capacity change events (Michael
   Kelley)

 - Restore VP assist page after CPU offlining and onlining (Vitaly
   Kuznetsov)

 - Fix some memory leak issues in VMBus (Yang Yingliang)

* tag 'hyperv-fixes-signed-20221125' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux:
  Drivers: hv: vmbus: fix possible memory leak in vmbus_device_register()
  Drivers: hv: vmbus: fix double free in the error path of vmbus_add_channel_work()
  PCI: hv: Only reuse existing IRTE allocation for Multi-MSI
  scsi: storvsc: Fix handling of srb_status and capacity change events
  x86/hyperv: Restore VP assist page after cpu offlining/onlining
parents 0b1dcc2c 25c94b05
......@@ -77,7 +77,7 @@ static int hyperv_init_ghcb(void)
static int hv_cpu_init(unsigned int cpu)
{
union hv_vp_assist_msr_contents msr = { 0 };
struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()];
struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu];
int ret;
ret = hv_common_cpu_init(cpu);
......@@ -87,34 +87,32 @@ static int hv_cpu_init(unsigned int cpu)
if (!hv_vp_assist_page)
return 0;
if (!*hvp) {
if (hv_root_partition) {
/*
* For root partition we get the hypervisor provided VP assist
* page, instead of allocating a new page.
*/
rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
*hvp = memremap(msr.pfn <<
HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT,
PAGE_SIZE, MEMREMAP_WB);
} else {
/*
* The VP assist page is an "overlay" page (see Hyper-V TLFS's
* Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed
* out to make sure we always write the EOI MSR in
* hv_apic_eoi_write() *after* the EOI optimization is disabled
* in hv_cpu_die(), otherwise a CPU may not be stopped in the
* case of CPU offlining and the VM will hang.
*/
if (hv_root_partition) {
/*
* For root partition we get the hypervisor provided VP assist
* page, instead of allocating a new page.
*/
rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
*hvp = memremap(msr.pfn << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT,
PAGE_SIZE, MEMREMAP_WB);
} else {
/*
* The VP assist page is an "overlay" page (see Hyper-V TLFS's
* Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed
* out to make sure we always write the EOI MSR in
* hv_apic_eoi_write() *after* the EOI optimization is disabled
* in hv_cpu_die(), otherwise a CPU may not be stopped in the
* case of CPU offlining and the VM will hang.
*/
if (!*hvp)
*hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
if (*hvp)
msr.pfn = vmalloc_to_pfn(*hvp);
}
WARN_ON(!(*hvp));
if (*hvp) {
msr.enable = 1;
wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
}
if (*hvp)
msr.pfn = vmalloc_to_pfn(*hvp);
}
if (!WARN_ON(!(*hvp))) {
msr.enable = 1;
wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
}
return hyperv_init_ghcb();
......
......@@ -533,13 +533,17 @@ static void vmbus_add_channel_work(struct work_struct *work)
* Add the new device to the bus. This will kick off device-driver
* binding which eventually invokes the device driver's AddDevice()
* method.
*
* If vmbus_device_register() fails, the 'device_obj' is freed in
* vmbus_device_release() as called by device_unregister() in the
* error path of vmbus_device_register(). In the outside error
* path, there's no need to free it.
*/
ret = vmbus_device_register(newchannel->device_obj);
if (ret != 0) {
pr_err("unable to add child device object (relid %d)\n",
newchannel->offermsg.child_relid);
kfree(newchannel->device_obj);
goto err_deq_chan;
}
......
......@@ -2082,6 +2082,7 @@ int vmbus_device_register(struct hv_device *child_device_obj)
ret = device_register(&child_device_obj->device);
if (ret) {
pr_err("Unable to register child device\n");
put_device(&child_device_obj->device);
return ret;
}
......
......@@ -1613,7 +1613,7 @@ static void hv_pci_compose_compl(void *context, struct pci_response *resp,
}
static u32 hv_compose_msi_req_v1(
struct pci_create_interrupt *int_pkt, const struct cpumask *affinity,
struct pci_create_interrupt *int_pkt,
u32 slot, u8 vector, u16 vector_count)
{
int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
......@@ -1631,6 +1631,35 @@ static u32 hv_compose_msi_req_v1(
return sizeof(*int_pkt);
}
/*
* The vCPU selected by hv_compose_multi_msi_req_get_cpu() and
* hv_compose_msi_req_get_cpu() is a "dummy" vCPU because the final vCPU to be
* interrupted is specified later in hv_irq_unmask() and communicated to Hyper-V
* via the HVCALL_RETARGET_INTERRUPT hypercall. But the choice of dummy vCPU is
* not irrelevant because Hyper-V chooses the physical CPU to handle the
* interrupts based on the vCPU specified in message sent to the vPCI VSP in
* hv_compose_msi_msg(). Hyper-V's choice of pCPU is not visible to the guest,
* but assigning too many vPCI device interrupts to the same pCPU can cause a
* performance bottleneck. So we spread out the dummy vCPUs to influence Hyper-V
* to spread out the pCPUs that it selects.
*
* For the single-MSI and MSI-X cases, it's OK for hv_compose_msi_req_get_cpu()
* to always return the same dummy vCPU, because a second call to
* hv_compose_msi_msg() contains the "real" vCPU, causing Hyper-V to choose a
* new pCPU for the interrupt. But for the multi-MSI case, the second call to
* hv_compose_msi_msg() exits without sending a message to the vPCI VSP, so the
* original dummy vCPU is used. This dummy vCPU must be round-robin'ed so that
* the pCPUs are spread out. All interrupts for a multi-MSI device end up using
* the same pCPU, even though the vCPUs will be spread out by later calls
* to hv_irq_unmask(), but that is the best we can do now.
*
* With Hyper-V in Nov 2022, the HVCALL_RETARGET_INTERRUPT hypercall does *not*
* cause Hyper-V to reselect the pCPU based on the specified vCPU. Such an
* enhancement is planned for a future version. With that enhancement, the
* dummy vCPU selection won't matter, and interrupts for the same multi-MSI
* device will be spread across multiple pCPUs.
*/
/*
* Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
* by subsequent retarget in hv_irq_unmask().
......@@ -1640,18 +1669,39 @@ static int hv_compose_msi_req_get_cpu(const struct cpumask *affinity)
return cpumask_first_and(affinity, cpu_online_mask);
}
static u32 hv_compose_msi_req_v2(
struct pci_create_interrupt2 *int_pkt, const struct cpumask *affinity,
u32 slot, u8 vector, u16 vector_count)
/*
 * Pick the dummy vCPU for a multi-MSI request. Each call advances a shared
 * cursor over cpu_online_mask so that successive multi-MSI requests do not
 * all name vCPU0.
 */
static int hv_compose_multi_msi_req_get_cpu(void)
{
	static DEFINE_SPINLOCK(multi_msi_cpu_lock);
	/* Last CPU handed out; -1 makes the first call start at CPU 0. */
	static int cpu_next = -1;
	unsigned long irq_flags;
	int chosen;

	/* The cursor is shared by all callers, so advance it under the lock. */
	spin_lock_irqsave(&multi_msi_cpu_lock, irq_flags);
	chosen = cpumask_next_wrap(cpu_next, cpu_online_mask, nr_cpu_ids, false);
	cpu_next = chosen;
	spin_unlock_irqrestore(&multi_msi_cpu_lock, irq_flags);

	return chosen;
}
static u32 hv_compose_msi_req_v2(
struct pci_create_interrupt2 *int_pkt, int cpu,
u32 slot, u8 vector, u16 vector_count)
{
int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
int_pkt->wslot.slot = slot;
int_pkt->int_desc.vector = vector;
int_pkt->int_desc.vector_count = vector_count;
int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
cpu = hv_compose_msi_req_get_cpu(affinity);
int_pkt->int_desc.processor_array[0] =
hv_cpu_number_to_vp_number(cpu);
int_pkt->int_desc.processor_count = 1;
......@@ -1660,18 +1710,15 @@ static u32 hv_compose_msi_req_v2(
}
static u32 hv_compose_msi_req_v3(
struct pci_create_interrupt3 *int_pkt, const struct cpumask *affinity,
struct pci_create_interrupt3 *int_pkt, int cpu,
u32 slot, u32 vector, u16 vector_count)
{
int cpu;
int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE3;
int_pkt->wslot.slot = slot;
int_pkt->int_desc.vector = vector;
int_pkt->int_desc.reserved = 0;
int_pkt->int_desc.vector_count = vector_count;
int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
cpu = hv_compose_msi_req_get_cpu(affinity);
int_pkt->int_desc.processor_array[0] =
hv_cpu_number_to_vp_number(cpu);
int_pkt->int_desc.processor_count = 1;
......@@ -1715,12 +1762,18 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
struct pci_create_interrupt3 v3;
} int_pkts;
} __packed ctxt;
bool multi_msi;
u64 trans_id;
u32 size;
int ret;
int cpu;
msi_desc = irq_data_get_msi_desc(data);
multi_msi = !msi_desc->pci.msi_attrib.is_msix &&
msi_desc->nvec_used > 1;
/* Reuse the previous allocation */
if (data->chip_data) {
if (data->chip_data && multi_msi) {
int_desc = data->chip_data;
msg->address_hi = int_desc->address >> 32;
msg->address_lo = int_desc->address & 0xffffffff;
......@@ -1728,7 +1781,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return;
}
msi_desc = irq_data_get_msi_desc(data);
pdev = msi_desc_to_pci_dev(msi_desc);
dest = irq_data_get_effective_affinity_mask(data);
pbus = pdev->bus;
......@@ -1738,11 +1790,18 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
if (!hpdev)
goto return_null_message;
/* Free any previous message that might have already been composed. */
if (data->chip_data && !multi_msi) {
int_desc = data->chip_data;
data->chip_data = NULL;
hv_int_desc_free(hpdev, int_desc);
}
int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC);
if (!int_desc)
goto drop_reference;
if (!msi_desc->pci.msi_attrib.is_msix && msi_desc->nvec_used > 1) {
if (multi_msi) {
/*
* If this is not the first MSI of Multi MSI, we already have
* a mapping. Can exit early.
......@@ -1767,9 +1826,11 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
*/
vector = 32;
vector_count = msi_desc->nvec_used;
cpu = hv_compose_multi_msi_req_get_cpu();
} else {
vector = hv_msi_get_int_vector(data);
vector_count = 1;
cpu = hv_compose_msi_req_get_cpu(dest);
}
/*
......@@ -1785,7 +1846,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
switch (hbus->protocol_version) {
case PCI_PROTOCOL_VERSION_1_1:
size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
dest,
hpdev->desc.win_slot.slot,
(u8)vector,
vector_count);
......@@ -1794,7 +1854,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
case PCI_PROTOCOL_VERSION_1_2:
case PCI_PROTOCOL_VERSION_1_3:
size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
dest,
cpu,
hpdev->desc.win_slot.slot,
(u8)vector,
vector_count);
......@@ -1802,7 +1862,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
case PCI_PROTOCOL_VERSION_1_4:
size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3,
dest,
cpu,
hpdev->desc.win_slot.slot,
vector,
vector_count);
......
......@@ -303,16 +303,21 @@ enum storvsc_request_type {
};
/*
* SRB status codes and masks; a subset of the codes used here.
* SRB status codes and masks. In the 8-bit field, the two high order bits
* are flags, while the remaining 6 bits are an integer status code. The
* definitions here include only the subset of the integer status codes that
* are tested for in this driver.
*/
#define SRB_STATUS_AUTOSENSE_VALID 0x80
#define SRB_STATUS_QUEUE_FROZEN 0x40
#define SRB_STATUS_INVALID_LUN 0x20
#define SRB_STATUS_SUCCESS 0x01
#define SRB_STATUS_ABORTED 0x02
#define SRB_STATUS_ERROR 0x04
#define SRB_STATUS_DATA_OVERRUN 0x12
/* SRB status integer codes */
#define SRB_STATUS_SUCCESS 0x01
#define SRB_STATUS_ABORTED 0x02
#define SRB_STATUS_ERROR 0x04
#define SRB_STATUS_INVALID_REQUEST 0x06
#define SRB_STATUS_DATA_OVERRUN 0x12
#define SRB_STATUS_INVALID_LUN 0x20
/*
 * Extract the integer status code from an srb_status byte by masking off
 * the two high-order flag bits (AUTOSENSE_VALID and QUEUE_FROZEN). The
 * argument is parenthesized so that expression arguments (e.g. "a | b")
 * are masked as a whole rather than being split by operator precedence.
 */
#define SRB_STATUS(status) \
	((status) & ~(SRB_STATUS_AUTOSENSE_VALID | SRB_STATUS_QUEUE_FROZEN))
......@@ -969,38 +974,25 @@ static void storvsc_handle_error(struct vmscsi_request *vm_srb,
void (*process_err_fn)(struct work_struct *work);
struct hv_host_device *host_dev = shost_priv(host);
/*
* In some situations, Hyper-V sets multiple bits in the
* srb_status, such as ABORTED and ERROR. So process them
* individually, with the most specific bits first.
*/
if (vm_srb->srb_status & SRB_STATUS_INVALID_LUN) {
set_host_byte(scmnd, DID_NO_CONNECT);
process_err_fn = storvsc_remove_lun;
goto do_work;
}
switch (SRB_STATUS(vm_srb->srb_status)) {
case SRB_STATUS_ERROR:
case SRB_STATUS_ABORTED:
case SRB_STATUS_INVALID_REQUEST:
if (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID) {
/* Check for capacity change */
if ((asc == 0x2a) && (ascq == 0x9)) {
process_err_fn = storvsc_device_scan;
/* Retry the I/O that triggered this. */
set_host_byte(scmnd, DID_REQUEUE);
goto do_work;
}
if (vm_srb->srb_status & SRB_STATUS_ABORTED) {
if (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID &&
/* Capacity data has changed */
(asc == 0x2a) && (ascq == 0x9)) {
process_err_fn = storvsc_device_scan;
/*
* Retry the I/O that triggered this.
* Otherwise, let upper layer deal with the
* error when sense message is present
*/
set_host_byte(scmnd, DID_REQUEUE);
goto do_work;
}
}
if (vm_srb->srb_status & SRB_STATUS_ERROR) {
/*
* Let upper layer deal with error when
* sense message is present.
*/
if (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID)
return;
}
/*
* If there is an error; offline the device since all
......@@ -1023,6 +1015,13 @@ static void storvsc_handle_error(struct vmscsi_request *vm_srb,
default:
set_host_byte(scmnd, DID_ERROR);
}
return;
case SRB_STATUS_INVALID_LUN:
set_host_byte(scmnd, DID_NO_CONNECT);
process_err_fn = storvsc_remove_lun;
goto do_work;
}
return;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment