Commit 764bfd13 authored by farah kassabri's avatar farah kassabri Committed by Oded Gabbay

accel/habanalabs/gaudi2: add eq health check using irq

This is the second patch applying the EQ health check mechanism;
it adds support for the interrupt flow on the Gaudi2 ASIC.

More info about the interrupt mechanism:
Set a dedicated MSI-X vector for the EQ error interrupt, and add
an interrupt handler for it.
When the FW detects an issue with the EQ, such as EQ_FULL, it will
raise that interrupt and the driver should reset the device.
The driver informs the FW which MSI-X index to use through
the already existing handshake mechanism, which
sends an MSI-X info message to the FW.
Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent 7c4130e6
...@@ -3689,6 +3689,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg); ...@@ -3689,6 +3689,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg);
irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg); irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg);
irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg); irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg);
irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg); irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg);
irqreturn_t hl_irq_eq_error_interrupt_thread_handler(int irq, void *arg);
u32 hl_cq_inc_ptr(u32 ptr); u32 hl_cq_inc_ptr(u32 ptr);
int hl_asid_init(struct hl_device *hdev); int hl_asid_init(struct hl_device *hdev);
......
...@@ -401,6 +401,18 @@ irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg) ...@@ -401,6 +401,18 @@ irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg)
return IRQ_HANDLED; return IRQ_HANDLED;
} }
/**
 * hl_irq_eq_error_interrupt_thread_handler - threaded handler for the EQ
 *                                            error interrupt
 *
 * @irq: irq number (not used by this handler)
 * @arg: pointer to the habanalabs device structure
 *
 * Logs that an EQ error interrupt arrived and asks for a conditional hard
 * reset of the device, passing the "device reset" and "device unavailable"
 * notifier events along with the request.
 *
 * Return: always IRQ_HANDLED.
 */
irqreturn_t hl_irq_eq_error_interrupt_thread_handler(int irq, void *arg)
{
	struct hl_device *hdev = arg;
	u64 notify_mask;

	dev_err(hdev->dev, "EQ error interrupt received\n");

	/* Events reported together with the reset request */
	notify_mask = HL_NOTIFIER_EVENT_DEVICE_RESET;
	notify_mask |= HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;

	hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, notify_mask);

	return IRQ_HANDLED;
}
/** /**
* hl_irq_handler_eq - irq handler for event queue * hl_irq_handler_eq - irq handler for event queue
* *
......
...@@ -4175,6 +4175,8 @@ static const char *gaudi2_irq_name(u16 irq_number) ...@@ -4175,6 +4175,8 @@ static const char *gaudi2_irq_name(u16 irq_number)
return "gaudi2 unexpected error"; return "gaudi2 unexpected error";
case GAUDI2_IRQ_NUM_USER_FIRST ... GAUDI2_IRQ_NUM_USER_LAST: case GAUDI2_IRQ_NUM_USER_FIRST ... GAUDI2_IRQ_NUM_USER_LAST:
return "gaudi2 user completion"; return "gaudi2 user completion";
case GAUDI2_IRQ_NUM_EQ_ERROR:
return "gaudi2 eq error";
default: default:
return "invalid"; return "invalid";
} }
...@@ -4317,6 +4319,15 @@ static int gaudi2_enable_msix(struct hl_device *hdev) ...@@ -4317,6 +4319,15 @@ static int gaudi2_enable_msix(struct hl_device *hdev)
} }
} }
irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR);
rc = request_threaded_irq(irq, NULL, hl_irq_eq_error_interrupt_thread_handler,
IRQF_ONESHOT, gaudi2_irq_name(GAUDI2_IRQ_NUM_EQ_ERROR),
hdev);
if (rc) {
dev_err(hdev->dev, "Failed to request IRQ %d", irq);
goto free_user_irq;
}
gaudi2->hw_cap_initialized |= HW_CAP_MSIX; gaudi2->hw_cap_initialized |= HW_CAP_MSIX;
return 0; return 0;
...@@ -4376,6 +4387,7 @@ static void gaudi2_sync_irqs(struct hl_device *hdev) ...@@ -4376,6 +4387,7 @@ static void gaudi2_sync_irqs(struct hl_device *hdev)
} }
synchronize_irq(pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EVENT_QUEUE)); synchronize_irq(pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EVENT_QUEUE));
synchronize_irq(pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR));
} }
static void gaudi2_disable_msix(struct hl_device *hdev) static void gaudi2_disable_msix(struct hl_device *hdev)
...@@ -4412,6 +4424,9 @@ static void gaudi2_disable_msix(struct hl_device *hdev) ...@@ -4412,6 +4424,9 @@ static void gaudi2_disable_msix(struct hl_device *hdev)
cq = &hdev->completion_queue[GAUDI2_RESERVED_CQ_CS_COMPLETION]; cq = &hdev->completion_queue[GAUDI2_RESERVED_CQ_CS_COMPLETION];
free_irq(irq, cq); free_irq(irq, cq);
irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR);
free_irq(irq, hdev);
pci_free_irq_vectors(hdev->pdev); pci_free_irq_vectors(hdev->pdev);
gaudi2->hw_cap_initialized &= ~HW_CAP_MSIX; gaudi2->hw_cap_initialized &= ~HW_CAP_MSIX;
...@@ -11345,6 +11360,7 @@ static int gaudi2_ack_mmu_page_fault_or_access_error(struct hl_device *hdev, u64 ...@@ -11345,6 +11360,7 @@ static int gaudi2_ack_mmu_page_fault_or_access_error(struct hl_device *hdev, u64
/*
 * gaudi2_get_msi_info - fill the MSI-X index table with the Gaudi2 indices.
 * @table: little-endian table indexed by the CPUCP MSI type values.
 *
 * Stores the event queue MSI-X index and the EQ error MSI-X index, each
 * converted to little-endian, in their respective table slots.
 */
static void gaudi2_get_msi_info(__le32 *table) static void gaudi2_get_msi_info(__le32 *table)
{ {
table[CPUCP_EVENT_QUEUE_MSI_TYPE] = cpu_to_le32(GAUDI2_EVENT_QUEUE_MSIX_IDX); table[CPUCP_EVENT_QUEUE_MSI_TYPE] = cpu_to_le32(GAUDI2_EVENT_QUEUE_MSIX_IDX);
table[CPUCP_EVENT_QUEUE_ERR_MSI_TYPE] = cpu_to_le32(GAUDI2_IRQ_NUM_EQ_ERROR);
} }
static int gaudi2_map_pll_idx_to_fw_idx(u32 pll_idx) static int gaudi2_map_pll_idx_to_fw_idx(u32 pll_idx)
......
...@@ -419,6 +419,7 @@ enum gaudi2_irq_num { ...@@ -419,6 +419,7 @@ enum gaudi2_irq_num {
GAUDI2_IRQ_NUM_NIC_PORT_FIRST, GAUDI2_IRQ_NUM_NIC_PORT_FIRST,
GAUDI2_IRQ_NUM_NIC_PORT_LAST = (GAUDI2_IRQ_NUM_NIC_PORT_FIRST + NIC_NUMBER_OF_PORTS - 1), GAUDI2_IRQ_NUM_NIC_PORT_LAST = (GAUDI2_IRQ_NUM_NIC_PORT_FIRST + NIC_NUMBER_OF_PORTS - 1),
GAUDI2_IRQ_NUM_TPC_ASSERT, GAUDI2_IRQ_NUM_TPC_ASSERT,
GAUDI2_IRQ_NUM_EQ_ERROR,
GAUDI2_IRQ_NUM_RESERVED_FIRST, GAUDI2_IRQ_NUM_RESERVED_FIRST,
GAUDI2_IRQ_NUM_RESERVED_LAST = (GAUDI2_MSIX_ENTRIES - GAUDI2_TOTAL_USER_INTERRUPTS - 1), GAUDI2_IRQ_NUM_RESERVED_LAST = (GAUDI2_MSIX_ENTRIES - GAUDI2_TOTAL_USER_INTERRUPTS - 1),
GAUDI2_IRQ_NUM_UNEXPECTED_ERROR = RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT, GAUDI2_IRQ_NUM_UNEXPECTED_ERROR = RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT,
......
...@@ -1004,6 +1004,7 @@ enum cpucp_msi_type { ...@@ -1004,6 +1004,7 @@ enum cpucp_msi_type {
CPUCP_NIC_PORT5_MSI_TYPE, CPUCP_NIC_PORT5_MSI_TYPE,
CPUCP_NIC_PORT7_MSI_TYPE, CPUCP_NIC_PORT7_MSI_TYPE,
CPUCP_NIC_PORT9_MSI_TYPE, CPUCP_NIC_PORT9_MSI_TYPE,
CPUCP_EVENT_QUEUE_ERR_MSI_TYPE,
CPUCP_NUM_OF_MSI_TYPES CPUCP_NUM_OF_MSI_TYPES
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment