Commit c6a4f256 authored by Ofir Bitton's avatar Ofir Bitton Committed by Oded Gabbay

accel/habanalabs: notify user about undefined opcode event

In order for user to be aware of undefined opcode events, we must
store all relevant information and notify user about the failure.
The user will fetch the stored info via info ioctl.
Signed-off-by: default avatarOfir Bitton <obitton@habana.ai>
Reviewed-by: default avatarOded Gabbay <ogabbay@kernel.org>
Signed-off-by: default avatarOded Gabbay <ogabbay@kernel.org>
parent a35c9976
...@@ -993,6 +993,111 @@ gaudi2_pcie_addr_dec_error_cause[GAUDI2_NUM_OF_PCIE_ADDR_DEC_ERR_CAUSE] = { ...@@ -993,6 +993,111 @@ gaudi2_pcie_addr_dec_error_cause[GAUDI2_NUM_OF_PCIE_ADDR_DEC_ERR_CAUSE] = {
"TLP is blocked by RR" "TLP is blocked by RR"
}; };
static const int gaudi2_queue_id_to_engine_id[] = {
[GAUDI2_QUEUE_ID_PDMA_0_0...GAUDI2_QUEUE_ID_PDMA_0_3] = GAUDI2_ENGINE_ID_PDMA_0,
[GAUDI2_QUEUE_ID_PDMA_1_0...GAUDI2_QUEUE_ID_PDMA_1_3] = GAUDI2_ENGINE_ID_PDMA_1,
[GAUDI2_QUEUE_ID_DCORE0_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE0_EDMA_0_3] =
GAUDI2_DCORE0_ENGINE_ID_EDMA_0,
[GAUDI2_QUEUE_ID_DCORE0_EDMA_1_0...GAUDI2_QUEUE_ID_DCORE0_EDMA_1_3] =
GAUDI2_DCORE0_ENGINE_ID_EDMA_1,
[GAUDI2_QUEUE_ID_DCORE1_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE1_EDMA_0_3] =
GAUDI2_DCORE1_ENGINE_ID_EDMA_0,
[GAUDI2_QUEUE_ID_DCORE1_EDMA_1_0...GAUDI2_QUEUE_ID_DCORE1_EDMA_1_3] =
GAUDI2_DCORE1_ENGINE_ID_EDMA_1,
[GAUDI2_QUEUE_ID_DCORE2_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE2_EDMA_0_3] =
GAUDI2_DCORE2_ENGINE_ID_EDMA_0,
[GAUDI2_QUEUE_ID_DCORE2_EDMA_1_0...GAUDI2_QUEUE_ID_DCORE2_EDMA_1_3] =
GAUDI2_DCORE2_ENGINE_ID_EDMA_1,
[GAUDI2_QUEUE_ID_DCORE3_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE3_EDMA_0_3] =
GAUDI2_DCORE3_ENGINE_ID_EDMA_0,
[GAUDI2_QUEUE_ID_DCORE3_EDMA_1_0...GAUDI2_QUEUE_ID_DCORE3_EDMA_1_3] =
GAUDI2_DCORE3_ENGINE_ID_EDMA_1,
[GAUDI2_QUEUE_ID_DCORE0_MME_0_0...GAUDI2_QUEUE_ID_DCORE0_MME_0_3] =
GAUDI2_DCORE0_ENGINE_ID_MME,
[GAUDI2_QUEUE_ID_DCORE1_MME_0_0...GAUDI2_QUEUE_ID_DCORE1_MME_0_3] =
GAUDI2_DCORE1_ENGINE_ID_MME,
[GAUDI2_QUEUE_ID_DCORE2_MME_0_0...GAUDI2_QUEUE_ID_DCORE2_MME_0_3] =
GAUDI2_DCORE2_ENGINE_ID_MME,
[GAUDI2_QUEUE_ID_DCORE3_MME_0_0...GAUDI2_QUEUE_ID_DCORE3_MME_0_3] =
GAUDI2_DCORE3_ENGINE_ID_MME,
[GAUDI2_QUEUE_ID_DCORE0_TPC_0_0...GAUDI2_QUEUE_ID_DCORE0_TPC_0_3] =
GAUDI2_DCORE0_ENGINE_ID_TPC_0,
[GAUDI2_QUEUE_ID_DCORE0_TPC_1_0...GAUDI2_QUEUE_ID_DCORE0_TPC_1_3] =
GAUDI2_DCORE0_ENGINE_ID_TPC_1,
[GAUDI2_QUEUE_ID_DCORE0_TPC_2_0...GAUDI2_QUEUE_ID_DCORE0_TPC_2_3] =
GAUDI2_DCORE0_ENGINE_ID_TPC_2,
[GAUDI2_QUEUE_ID_DCORE0_TPC_3_0...GAUDI2_QUEUE_ID_DCORE0_TPC_3_3] =
GAUDI2_DCORE0_ENGINE_ID_TPC_3,
[GAUDI2_QUEUE_ID_DCORE0_TPC_4_0...GAUDI2_QUEUE_ID_DCORE0_TPC_4_3] =
GAUDI2_DCORE0_ENGINE_ID_TPC_4,
[GAUDI2_QUEUE_ID_DCORE0_TPC_5_0...GAUDI2_QUEUE_ID_DCORE0_TPC_5_3] =
GAUDI2_DCORE0_ENGINE_ID_TPC_5,
[GAUDI2_QUEUE_ID_DCORE0_TPC_6_0...GAUDI2_QUEUE_ID_DCORE0_TPC_6_3] =
GAUDI2_DCORE0_ENGINE_ID_TPC_6,
[GAUDI2_QUEUE_ID_DCORE1_TPC_0_0...GAUDI2_QUEUE_ID_DCORE1_TPC_0_3] =
GAUDI2_DCORE1_ENGINE_ID_TPC_0,
[GAUDI2_QUEUE_ID_DCORE1_TPC_1_0...GAUDI2_QUEUE_ID_DCORE1_TPC_1_3] =
GAUDI2_DCORE1_ENGINE_ID_TPC_1,
[GAUDI2_QUEUE_ID_DCORE1_TPC_2_0...GAUDI2_QUEUE_ID_DCORE1_TPC_2_3] =
GAUDI2_DCORE1_ENGINE_ID_TPC_2,
[GAUDI2_QUEUE_ID_DCORE1_TPC_3_0...GAUDI2_QUEUE_ID_DCORE1_TPC_3_3] =
GAUDI2_DCORE1_ENGINE_ID_TPC_3,
[GAUDI2_QUEUE_ID_DCORE1_TPC_4_0...GAUDI2_QUEUE_ID_DCORE1_TPC_4_3] =
GAUDI2_DCORE1_ENGINE_ID_TPC_4,
[GAUDI2_QUEUE_ID_DCORE1_TPC_5_0...GAUDI2_QUEUE_ID_DCORE1_TPC_5_3] =
GAUDI2_DCORE1_ENGINE_ID_TPC_5,
[GAUDI2_QUEUE_ID_DCORE2_TPC_0_0...GAUDI2_QUEUE_ID_DCORE2_TPC_0_3] =
GAUDI2_DCORE2_ENGINE_ID_TPC_0,
[GAUDI2_QUEUE_ID_DCORE2_TPC_1_0...GAUDI2_QUEUE_ID_DCORE2_TPC_1_3] =
GAUDI2_DCORE2_ENGINE_ID_TPC_1,
[GAUDI2_QUEUE_ID_DCORE2_TPC_2_0...GAUDI2_QUEUE_ID_DCORE2_TPC_2_3] =
GAUDI2_DCORE2_ENGINE_ID_TPC_2,
[GAUDI2_QUEUE_ID_DCORE2_TPC_3_0...GAUDI2_QUEUE_ID_DCORE2_TPC_3_3] =
GAUDI2_DCORE2_ENGINE_ID_TPC_3,
[GAUDI2_QUEUE_ID_DCORE2_TPC_4_0...GAUDI2_QUEUE_ID_DCORE2_TPC_4_3] =
GAUDI2_DCORE2_ENGINE_ID_TPC_4,
[GAUDI2_QUEUE_ID_DCORE2_TPC_5_0...GAUDI2_QUEUE_ID_DCORE2_TPC_5_3] =
GAUDI2_DCORE2_ENGINE_ID_TPC_5,
[GAUDI2_QUEUE_ID_DCORE3_TPC_0_0...GAUDI2_QUEUE_ID_DCORE3_TPC_0_3] =
GAUDI2_DCORE3_ENGINE_ID_TPC_0,
[GAUDI2_QUEUE_ID_DCORE3_TPC_1_0...GAUDI2_QUEUE_ID_DCORE3_TPC_1_3] =
GAUDI2_DCORE3_ENGINE_ID_TPC_1,
[GAUDI2_QUEUE_ID_DCORE3_TPC_2_0...GAUDI2_QUEUE_ID_DCORE3_TPC_2_3] =
GAUDI2_DCORE3_ENGINE_ID_TPC_2,
[GAUDI2_QUEUE_ID_DCORE3_TPC_3_0...GAUDI2_QUEUE_ID_DCORE3_TPC_3_3] =
GAUDI2_DCORE3_ENGINE_ID_TPC_3,
[GAUDI2_QUEUE_ID_DCORE3_TPC_4_0...GAUDI2_QUEUE_ID_DCORE3_TPC_4_3] =
GAUDI2_DCORE3_ENGINE_ID_TPC_4,
[GAUDI2_QUEUE_ID_DCORE3_TPC_5_0...GAUDI2_QUEUE_ID_DCORE3_TPC_5_3] =
GAUDI2_DCORE3_ENGINE_ID_TPC_5,
[GAUDI2_QUEUE_ID_NIC_0_0...GAUDI2_QUEUE_ID_NIC_0_3] = GAUDI2_ENGINE_ID_NIC0_0,
[GAUDI2_QUEUE_ID_NIC_1_0...GAUDI2_QUEUE_ID_NIC_1_3] = GAUDI2_ENGINE_ID_NIC0_1,
[GAUDI2_QUEUE_ID_NIC_2_0...GAUDI2_QUEUE_ID_NIC_2_3] = GAUDI2_ENGINE_ID_NIC1_0,
[GAUDI2_QUEUE_ID_NIC_3_0...GAUDI2_QUEUE_ID_NIC_3_3] = GAUDI2_ENGINE_ID_NIC1_1,
[GAUDI2_QUEUE_ID_NIC_4_0...GAUDI2_QUEUE_ID_NIC_4_3] = GAUDI2_ENGINE_ID_NIC2_0,
[GAUDI2_QUEUE_ID_NIC_5_0...GAUDI2_QUEUE_ID_NIC_5_3] = GAUDI2_ENGINE_ID_NIC2_1,
[GAUDI2_QUEUE_ID_NIC_6_0...GAUDI2_QUEUE_ID_NIC_6_3] = GAUDI2_ENGINE_ID_NIC3_0,
[GAUDI2_QUEUE_ID_NIC_7_0...GAUDI2_QUEUE_ID_NIC_7_3] = GAUDI2_ENGINE_ID_NIC3_1,
[GAUDI2_QUEUE_ID_NIC_8_0...GAUDI2_QUEUE_ID_NIC_8_3] = GAUDI2_ENGINE_ID_NIC4_0,
[GAUDI2_QUEUE_ID_NIC_9_0...GAUDI2_QUEUE_ID_NIC_9_3] = GAUDI2_ENGINE_ID_NIC4_1,
[GAUDI2_QUEUE_ID_NIC_10_0...GAUDI2_QUEUE_ID_NIC_10_3] = GAUDI2_ENGINE_ID_NIC5_0,
[GAUDI2_QUEUE_ID_NIC_11_0...GAUDI2_QUEUE_ID_NIC_11_3] = GAUDI2_ENGINE_ID_NIC5_1,
[GAUDI2_QUEUE_ID_NIC_12_0...GAUDI2_QUEUE_ID_NIC_12_3] = GAUDI2_ENGINE_ID_NIC6_0,
[GAUDI2_QUEUE_ID_NIC_13_0...GAUDI2_QUEUE_ID_NIC_13_3] = GAUDI2_ENGINE_ID_NIC6_1,
[GAUDI2_QUEUE_ID_NIC_14_0...GAUDI2_QUEUE_ID_NIC_14_3] = GAUDI2_ENGINE_ID_NIC7_0,
[GAUDI2_QUEUE_ID_NIC_15_0...GAUDI2_QUEUE_ID_NIC_15_3] = GAUDI2_ENGINE_ID_NIC7_1,
[GAUDI2_QUEUE_ID_NIC_16_0...GAUDI2_QUEUE_ID_NIC_16_3] = GAUDI2_ENGINE_ID_NIC8_0,
[GAUDI2_QUEUE_ID_NIC_17_0...GAUDI2_QUEUE_ID_NIC_17_3] = GAUDI2_ENGINE_ID_NIC8_1,
[GAUDI2_QUEUE_ID_NIC_18_0...GAUDI2_QUEUE_ID_NIC_18_3] = GAUDI2_ENGINE_ID_NIC9_0,
[GAUDI2_QUEUE_ID_NIC_19_0...GAUDI2_QUEUE_ID_NIC_19_3] = GAUDI2_ENGINE_ID_NIC9_1,
[GAUDI2_QUEUE_ID_NIC_20_0...GAUDI2_QUEUE_ID_NIC_20_3] = GAUDI2_ENGINE_ID_NIC10_0,
[GAUDI2_QUEUE_ID_NIC_21_0...GAUDI2_QUEUE_ID_NIC_21_3] = GAUDI2_ENGINE_ID_NIC10_1,
[GAUDI2_QUEUE_ID_NIC_22_0...GAUDI2_QUEUE_ID_NIC_22_3] = GAUDI2_ENGINE_ID_NIC11_0,
[GAUDI2_QUEUE_ID_NIC_23_0...GAUDI2_QUEUE_ID_NIC_23_3] = GAUDI2_ENGINE_ID_NIC11_1,
[GAUDI2_QUEUE_ID_ROT_0_0...GAUDI2_QUEUE_ID_ROT_0_3] = GAUDI2_ENGINE_ID_ROT_0,
[GAUDI2_QUEUE_ID_ROT_1_0...GAUDI2_QUEUE_ID_ROT_1_3] = GAUDI2_ENGINE_ID_ROT_1,
};
const u32 gaudi2_qm_blocks_bases[GAUDI2_QUEUE_ID_SIZE] = { const u32 gaudi2_qm_blocks_bases[GAUDI2_QUEUE_ID_SIZE] = {
[GAUDI2_QUEUE_ID_PDMA_0_0] = mmPDMA0_QM_BASE, [GAUDI2_QUEUE_ID_PDMA_0_0] = mmPDMA0_QM_BASE,
[GAUDI2_QUEUE_ID_PDMA_0_1] = mmPDMA0_QM_BASE, [GAUDI2_QUEUE_ID_PDMA_0_1] = mmPDMA0_QM_BASE,
...@@ -7753,7 +7858,7 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type, ...@@ -7753,7 +7858,7 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
return !!ecc_data->is_critical; return !!ecc_data->is_critical;
} }
static void print_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base) static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, u64 event_mask)
{ {
u32 lo, hi, cq_ptr_size, arc_cq_ptr_size; u32 lo, hi, cq_ptr_size, arc_cq_ptr_size;
u64 cq_ptr, arc_cq_ptr, cp_current_inst; u64 cq_ptr, arc_cq_ptr, cp_current_inst;
...@@ -7775,10 +7880,22 @@ static void print_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base) ...@@ -7775,10 +7880,22 @@ static void print_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base)
dev_info(hdev->dev, dev_info(hdev->dev,
"LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n", "LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n",
cq_ptr, cq_ptr_size, arc_cq_ptr, arc_cq_ptr_size, cp_current_inst); cq_ptr, cq_ptr_size, arc_cq_ptr, arc_cq_ptr_size, cp_current_inst);
if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
if (arc_cq_ptr) {
hdev->captured_err_info.undef_opcode.cq_addr = arc_cq_ptr;
hdev->captured_err_info.undef_opcode.cq_size = arc_cq_ptr_size;
} else {
hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr;
hdev->captured_err_info.undef_opcode.cq_size = cq_ptr_size;
}
hdev->captured_err_info.undef_opcode.stream_id = QMAN_STREAMS;
}
} }
static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type, static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type,
u64 qman_base, u32 qid_base) u64 qman_base, u32 qid_base, u64 *event_mask)
{ {
u32 i, j, glbl_sts_val, arb_err_val, num_error_causes, error_count = 0; u32 i, j, glbl_sts_val, arb_err_val, num_error_causes, error_count = 0;
u64 glbl_sts_addr, arb_err_addr; u64 glbl_sts_addr, arb_err_addr;
...@@ -7812,8 +7929,22 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type ...@@ -7812,8 +7929,22 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type
error_count++; error_count++;
} }
if (i == QMAN_STREAMS) if (i == QMAN_STREAMS && error_count) {
print_lower_qman_data_on_err(hdev, qman_base); /* check for undefined opcode */
if (glbl_sts_val & PDMA0_QM_GLBL_ERR_STS_CP_UNDEF_CMD_ERR_MASK &&
hdev->captured_err_info.undef_opcode.write_enable) {
memset(&hdev->captured_err_info.undef_opcode, 0,
sizeof(hdev->captured_err_info.undef_opcode));
hdev->captured_err_info.undef_opcode.write_enable = false;
hdev->captured_err_info.undef_opcode.timestamp = ktime_get();
hdev->captured_err_info.undef_opcode.engine_id =
gaudi2_queue_id_to_engine_id[qid_base];
*event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
}
handle_lower_qman_data_on_err(hdev, qman_base, *event_mask);
}
} }
arb_err_val = RREG32(arb_err_addr); arb_err_val = RREG32(arb_err_addr);
...@@ -8475,7 +8606,8 @@ static int gaudi2_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e ...@@ -8475,7 +8606,8 @@ static int gaudi2_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e
return 0; return 0;
} }
error_count = gaudi2_handle_qman_err_generic(hdev, event_type, qman_base, qid_base); error_count = gaudi2_handle_qman_err_generic(hdev, event_type, qman_base,
qid_base, event_mask);
/* Handle EDMA QM SEI here because there is no AXI error response event for EDMA */ /* Handle EDMA QM SEI here because there is no AXI error response event for EDMA */
if (event_type >= GAUDI2_EVENT_HDMA2_QM && event_type <= GAUDI2_EVENT_HDMA5_QM) { if (event_type >= GAUDI2_EVENT_HDMA2_QM && event_type <= GAUDI2_EVENT_HDMA5_QM) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment