Commit 254fac6d authored by Ofir Bitton, committed by Oded Gabbay

habanalabs/gaudi: add FW alive event support

In order for the driver to be aware of process or thread crashes inside
GAUDI's CPU, we introduce a new event which contains all the relevant
information. Upon receiving the event, the driver will dump the information
and reset the device.
Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent a3972581
...@@ -7451,6 +7451,16 @@ static void gaudi_print_out_of_sync_info(struct hl_device *hdev, ...@@ -7451,6 +7451,16 @@ static void gaudi_print_out_of_sync_info(struct hl_device *hdev,
sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci)); sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci));
} }
/*
 * gaudi_print_fw_alive_info() - log the contents of a FW alive event.
 * @hdev: habanalabs device the event was received on; its ->dev is used
 *        as the logging target.
 * @fw_alive: event payload to print.
 *
 * The multi-byte payload fields are declared little-endian (__le32/__le64),
 * so they are converted to CPU byte order before being formatted. Passing
 * them raw would print byte-swapped values on big-endian hosts.
 */
static void gaudi_print_fw_alive_info(struct hl_device *hdev,
					struct hl_eq_fw_alive *fw_alive)
{
	dev_err(hdev->dev,
		"FW alive report: severity=%s, process_id=%u, thread_id=%u, uptime=%llu seconds\n",
		/* Any severity value other than MINOR is reported as critical */
		(fw_alive->severity == FW_ALIVE_SEVERITY_MINOR) ?
		"Minor" : "Critical",
		le32_to_cpu(fw_alive->process_id),
		le32_to_cpu(fw_alive->thread_id),
		le64_to_cpu(fw_alive->uptime_seconds));
}
static int gaudi_soft_reset_late_init(struct hl_device *hdev) static int gaudi_soft_reset_late_init(struct hl_device *hdev)
{ {
struct gaudi_device *gaudi = hdev->asic_specific; struct gaudi_device *gaudi = hdev->asic_specific;
...@@ -7902,6 +7912,11 @@ static void gaudi_handle_eqe(struct hl_device *hdev, ...@@ -7902,6 +7912,11 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err); gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
goto reset_device; goto reset_device;
case GAUDI_EVENT_FW_ALIVE_S:
gaudi_print_irq_info(hdev, event_type, false);
gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive);
goto reset_device;
default: default:
dev_err(hdev->dev, "Received invalid H/W interrupt %d\n", dev_err(hdev->dev, "Received invalid H/W interrupt %d\n",
event_type); event_type);
......
...@@ -84,6 +84,20 @@ struct hl_eq_sm_sei_data { ...@@ -84,6 +84,20 @@ struct hl_eq_sm_sei_data {
__u8 pad[3]; __u8 pad[3];
}; };
/*
 * Severity levels for a FW alive report.
 *
 * The numeric values are written out explicitly so the mapping stays
 * obvious when the value is stored in a single byte rather than in an
 * enum-typed field.
 */
enum hl_fw_alive_severity {
	FW_ALIVE_SEVERITY_MINOR = 0,
	FW_ALIVE_SEVERITY_CRITICAL = 1
};
/*
 * struct hl_eq_fw_alive - payload of a FW alive event queue entry.
 *
 * Multi-byte fields are little-endian and must be converted with
 * le32_to_cpu()/le64_to_cpu() before use on the host. The field order and
 * sizes define the in-memory layout, so they must not be rearranged.
 */
struct hl_eq_fw_alive {
/* Uptime in seconds. NOTE(review): uptime of what (FW, process) is not
 * visible here - confirm against the FW interface spec.
 */
__le64 uptime_seconds;
/* ID of the process the report refers to */
__le32 process_id;
/* ID of the thread the report refers to */
__le32 thread_id;
/* Holds an enum hl_fw_alive_severity value, stored in a single byte */
__u8 severity;
/* Padding; presumably keeps the struct size a multiple of 8 bytes - confirm */
__u8 pad[7];
};
struct hl_eq_entry { struct hl_eq_entry {
struct hl_eq_header hdr; struct hl_eq_header hdr;
union { union {
...@@ -91,6 +105,7 @@ struct hl_eq_entry { ...@@ -91,6 +105,7 @@ struct hl_eq_entry {
struct hl_eq_hbm_ecc_data hbm_ecc_data; struct hl_eq_hbm_ecc_data hbm_ecc_data;
struct hl_eq_sm_sei_data sm_sei_data; struct hl_eq_sm_sei_data sm_sei_data;
struct cpucp_pkt_sync_err pkt_sync_err; struct cpucp_pkt_sync_err pkt_sync_err;
struct hl_eq_fw_alive fw_alive;
__le64 data[7]; __le64 data[7];
}; };
}; };
......
...@@ -303,6 +303,7 @@ enum gaudi_async_event_id { ...@@ -303,6 +303,7 @@ enum gaudi_async_event_id {
GAUDI_EVENT_NIC3_QP1 = 619, GAUDI_EVENT_NIC3_QP1 = 619,
GAUDI_EVENT_NIC4_QP0 = 620, GAUDI_EVENT_NIC4_QP0 = 620,
GAUDI_EVENT_NIC4_QP1 = 621, GAUDI_EVENT_NIC4_QP1 = 621,
GAUDI_EVENT_FW_ALIVE_S = 645,
GAUDI_EVENT_DEV_RESET_REQ = 646, GAUDI_EVENT_DEV_RESET_REQ = 646,
GAUDI_EVENT_PKT_QUEUE_OUT_SYNC = 647, GAUDI_EVENT_PKT_QUEUE_OUT_SYNC = 647,
GAUDI_EVENT_FIX_POWER_ENV_S = 658, GAUDI_EVENT_FIX_POWER_ENV_S = 658,
......
...@@ -669,7 +669,7 @@ static struct gaudi_async_events_ids_map gaudi_irq_map_table[] = { ...@@ -669,7 +669,7 @@ static struct gaudi_async_events_ids_map gaudi_irq_map_table[] = {
{ .fc_id = 642, .cpu_id = 491, .valid = 0, .name = "" }, { .fc_id = 642, .cpu_id = 491, .valid = 0, .name = "" },
{ .fc_id = 643, .cpu_id = 492, .valid = 0, .name = "" }, { .fc_id = 643, .cpu_id = 492, .valid = 0, .name = "" },
{ .fc_id = 644, .cpu_id = 493, .valid = 0, .name = "" }, { .fc_id = 644, .cpu_id = 493, .valid = 0, .name = "" },
{ .fc_id = 645, .cpu_id = 494, .valid = 0, .name = "" }, { .fc_id = 645, .cpu_id = 494, .valid = 1, .name = "FW_ALIVE_S" },
{ .fc_id = 646, .cpu_id = 495, .valid = 1, .name = "DEV_RESET_REQ" }, { .fc_id = 646, .cpu_id = 495, .valid = 1, .name = "DEV_RESET_REQ" },
{ .fc_id = 647, .cpu_id = 496, .valid = 1, { .fc_id = 647, .cpu_id = 496, .valid = 1,
.name = "PKT_QUEUE_OUT_SYNC" }, .name = "PKT_QUEUE_OUT_SYNC" },
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment