Commit 7c4130e6 authored by farah kassabri's avatar farah kassabri Committed by Oded Gabbay

accel/habanalabs/gaudi2: handle eq health heartbeat check

Add a mechanism for the FW EQ health check. This will be done using two flows:
the heartbeat mechanism, and a dedicated interrupt raised to
indicate an EQ failure such as EQ full.
This patch adds the implementation of the EQ heartbeat for the gaudi2 ASIC.

More info about the heartbeat mechanism:
The heartbeat mechanism is expanded to monitor a new event that
will be sent from FW upon receiving the heartbeat message.
That way the driver can know whether the EQ is working or not.
Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent 72bff371
...@@ -989,6 +989,25 @@ static bool is_pci_link_healthy(struct hl_device *hdev) ...@@ -989,6 +989,25 @@ static bool is_pci_link_healthy(struct hl_device *hdev)
return (vendor_id == PCI_VENDOR_ID_HABANALABS); return (vendor_id == PCI_VENDOR_ID_HABANALABS);
} }
/*
 * hl_device_eq_heartbeat - verify that an EQ heartbeat event arrived since the
 *                          previous check.
 * @hdev: pointer to the habanalabs device structure.
 *
 * Consumes the eq_heartbeat_received indication: when it is set it is cleared
 * for the next period, and when it is not set a hard reset of the device is
 * requested through hl_device_cond_reset().
 */
static void hl_device_eq_heartbeat(struct hl_device *hdev)
{
	u64 notify_mask;

	/*
	 * The EQ health check exists only in FW version 1.12.0 45.2.0 and above;
	 * eq_health_check_supported is set only on those FW versions.
	 * The check must also wait until the driver has enabled events from FW.
	 */
	if (!hdev->asic_prop.cpucp_info.eq_health_check_supported || !hdev->init_done)
		return;

	if (!hdev->eq_heartbeat_received) {
		/* No heartbeat event was seen during the last period - ask for a hard reset */
		notify_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
		hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, notify_mask);
		return;
	}

	/* Re-arm the indication so the next period requires a fresh event */
	hdev->eq_heartbeat_received = false;
}
static void hl_device_heartbeat(struct work_struct *work) static void hl_device_heartbeat(struct work_struct *work)
{ {
struct hl_device *hdev = container_of(work, struct hl_device, struct hl_device *hdev = container_of(work, struct hl_device,
...@@ -999,6 +1018,12 @@ static void hl_device_heartbeat(struct work_struct *work) ...@@ -999,6 +1018,12 @@ static void hl_device_heartbeat(struct work_struct *work)
if (!hl_device_operational(hdev, NULL)) if (!hl_device_operational(hdev, NULL))
goto reschedule; goto reschedule;
/*
* For EQ health check need to check if driver received the heartbeat eq event
* in order to validate the eq is working.
*/
hl_device_eq_heartbeat(hdev);
if (!hdev->asic_funcs->send_heartbeat(hdev)) if (!hdev->asic_funcs->send_heartbeat(hdev))
goto reschedule; goto reschedule;
...@@ -1055,7 +1080,15 @@ static int device_late_init(struct hl_device *hdev) ...@@ -1055,7 +1080,15 @@ static int device_late_init(struct hl_device *hdev)
hdev->high_pll = hdev->asic_prop.high_pll; hdev->high_pll = hdev->asic_prop.high_pll;
if (hdev->heartbeat) { if (hdev->heartbeat) {
/*
* Before scheduling the heartbeat, the driver will check if the eq event has been
* received. For the first schedule we need to set the indication to true; for the
* next ones this indication will be true only if the eq event was sent by FW.
*/
hdev->eq_heartbeat_received = true;
INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat); INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
schedule_delayed_work(&hdev->work_heartbeat, schedule_delayed_work(&hdev->work_heartbeat,
usecs_to_jiffies(HL_HEARTBEAT_PER_USEC)); usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
} }
...@@ -2235,8 +2268,6 @@ int hl_device_init(struct hl_device *hdev) ...@@ -2235,8 +2268,6 @@ int hl_device_init(struct hl_device *hdev)
"Successfully added device %s to habanalabs driver\n", "Successfully added device %s to habanalabs driver\n",
dev_name(&(hdev)->pdev->dev)); dev_name(&(hdev)->pdev->dev));
hdev->init_done = true;
/* After initialization is done, we are ready to receive events from /* After initialization is done, we are ready to receive events from
* the F/W. We can't do it before because we will ignore events and if * the F/W. We can't do it before because we will ignore events and if
* those events are fatal, we won't know about it and the device will * those events are fatal, we won't know about it and the device will
...@@ -2244,6 +2275,8 @@ int hl_device_init(struct hl_device *hdev) ...@@ -2244,6 +2275,8 @@ int hl_device_init(struct hl_device *hdev)
*/ */
hdev->asic_funcs->enable_events_from_fw(hdev); hdev->asic_funcs->enable_events_from_fw(hdev);
hdev->init_done = true;
return 0; return 0;
cb_pool_fini: cb_pool_fini:
......
...@@ -3314,6 +3314,7 @@ struct hl_reset_info { ...@@ -3314,6 +3314,7 @@ struct hl_reset_info {
* device. * device.
* @supports_ctx_switch: true if a ctx switch is required upon first submission. * @supports_ctx_switch: true if a ctx switch is required upon first submission.
* @support_preboot_binning: true if we support read binning info from preboot. * @support_preboot_binning: true if we support read binning info from preboot.
* @eq_heartbeat_received: indication that the eq heartbeat event has been received from FW.
* @nic_ports_mask: Controls which NIC ports are enabled. Used only for testing. * @nic_ports_mask: Controls which NIC ports are enabled. Used only for testing.
* @fw_components: Controls which f/w components to load to the device. There are multiple f/w * @fw_components: Controls which f/w components to load to the device. There are multiple f/w
* stages and sometimes we want to stop at a certain stage. Used only for testing. * stages and sometimes we want to stop at a certain stage. Used only for testing.
...@@ -3474,6 +3475,7 @@ struct hl_device { ...@@ -3474,6 +3475,7 @@ struct hl_device {
u8 reset_upon_device_release; u8 reset_upon_device_release;
u8 supports_ctx_switch; u8 supports_ctx_switch;
u8 support_preboot_binning; u8 support_preboot_binning;
u8 eq_heartbeat_received;
/* Parameters for bring-up to be upstreamed */ /* Parameters for bring-up to be upstreamed */
u64 nic_ports_mask; u64 nic_ports_mask;
......
...@@ -7804,6 +7804,7 @@ static inline bool is_info_event(u32 event) ...@@ -7804,6 +7804,7 @@ static inline bool is_info_event(u32 event)
* an indication to an error. * an indication to an error.
*/ */
case GAUDI2_EVENT_CPU0_STATUS_NIC0_ENG0 ... GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG1: case GAUDI2_EVENT_CPU0_STATUS_NIC0_ENG0 ... GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG1:
case GAUDI2_EVENT_ARC_EQ_HEARTBEAT:
return true; return true;
default: default:
return false; return false;
...@@ -9765,6 +9766,11 @@ static u16 event_id_to_engine_id(struct hl_device *hdev, u16 event_type) ...@@ -9765,6 +9766,11 @@ static u16 event_id_to_engine_id(struct hl_device *hdev, u16 event_type)
return U16_MAX; return U16_MAX;
} }
/*
 * hl_eq_heartbeat_event_handle - record that an EQ heartbeat event was received.
 * @hdev: pointer to the habanalabs device structure.
 *
 * Called from the event handler for GAUDI2_EVENT_ARC_EQ_HEARTBEAT. It only
 * raises the eq_heartbeat_received indication on the device.
 */
static void hl_eq_heartbeat_event_handle(struct hl_device *hdev)
{
hdev->eq_heartbeat_received = true;
}
static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry) static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
{ {
struct gaudi2_device *gaudi2 = hdev->asic_specific; struct gaudi2_device *gaudi2 = hdev->asic_specific;
...@@ -10190,6 +10196,10 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent ...@@ -10190,6 +10196,10 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
gaudi2_irq_map_table[event_type].name); gaudi2_irq_map_table[event_type].name);
break; break;
case GAUDI2_EVENT_ARC_EQ_HEARTBEAT:
hl_eq_heartbeat_event_handle(hdev);
error_count = GAUDI2_NA_EVENT_CAUSE;
break;
default: default:
if (gaudi2_irq_map_table[event_type].valid) { if (gaudi2_irq_map_table[event_type].valid) {
dev_err_ratelimited(hdev->dev, "Cannot find handler for event %d\n", dev_err_ratelimited(hdev->dev, "Cannot find handler for event %d\n",
......
...@@ -2674,17 +2674,19 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = { ...@@ -2674,17 +2674,19 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = {
{ .fc_id = 1321, .cpu_id = 627, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_HARD, { .fc_id = 1321, .cpu_id = 627, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_HARD,
.name = "DEV_RESET_REQ" }, .name = "DEV_RESET_REQ" },
{ .fc_id = 1322, .cpu_id = 628, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE, { .fc_id = 1322, .cpu_id = 628, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
.name = "PWR_BRK_ENTRY" }, .name = "ARC_PWR_BRK_ENTRY" },
{ .fc_id = 1323, .cpu_id = 629, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE, { .fc_id = 1323, .cpu_id = 629, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
.name = "PWR_BRK_EXT" }, .name = "ARC_PWR_BRK_EXT" },
{ .fc_id = 1324, .cpu_id = 630, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE, { .fc_id = 1324, .cpu_id = 630, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
.name = "PWR_RD_MODE0" }, .name = "ARC_PWR_RD_MODE0" },
{ .fc_id = 1325, .cpu_id = 631, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE, { .fc_id = 1325, .cpu_id = 631, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
.name = "PWR_RD_MODE1" }, .name = "ARC_PWR_RD_MODE1" },
{ .fc_id = 1326, .cpu_id = 632, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE, { .fc_id = 1326, .cpu_id = 632, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
.name = "PWR_RD_MODE2" }, .name = "ARC_PWR_RD_MODE2" },
{ .fc_id = 1327, .cpu_id = 633, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE, { .fc_id = 1327, .cpu_id = 633, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
.name = "PWR_RD_MODE3" }, .name = "ARC_PWR_RD_MODE3" },
{ .fc_id = 1328, .cpu_id = 634, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
.name = "ARC_EQ_HEARTBEAT" },
}; };
#endif /* __GAUDI2_ASYNC_IDS_MAP_EVENTS_EXT_H_ */ #endif /* __GAUDI2_ASYNC_IDS_MAP_EVENTS_EXT_H_ */
...@@ -33,6 +33,17 @@ ...@@ -33,6 +33,17 @@
#define PLL_MAP_MAX_BITS 128 #define PLL_MAP_MAX_BITS 128
#define PLL_MAP_LEN (PLL_MAP_MAX_BITS / 8) #define PLL_MAP_LEN (PLL_MAP_MAX_BITS / 8)
/*
 * EQ event identifiers.
 * Values are consecutive starting at 0 and are spelled out explicitly so that
 * inserting or reordering an entry cannot silently renumber the others.
 * NOTE(review): presumably these ids are part of the driver<->FW interface -
 * confirm before changing any value.
 */
enum eq_event_id {
	EQ_EVENT_NIC_STS_REQUEST = 0,
	EQ_EVENT_PWR_MODE_0 = 1,
	EQ_EVENT_PWR_MODE_1 = 2,
	EQ_EVENT_PWR_MODE_2 = 3,
	EQ_EVENT_PWR_MODE_3 = 4,
	EQ_EVENT_PWR_BRK_ENTRY = 5,
	EQ_EVENT_PWR_BRK_EXIT = 6,
	EQ_EVENT_HEARTBEAT = 7,
};
/* /*
* info of the pkt queue pointers in the first async occurrence * info of the pkt queue pointers in the first async occurrence
*/ */
...@@ -1143,6 +1154,7 @@ struct cpucp_security_info { ...@@ -1143,6 +1154,7 @@ struct cpucp_security_info {
* (0 = functional 1 = binned) * (0 = functional 1 = binned)
* @interposer_version: Interposer version programmed in eFuse * @interposer_version: Interposer version programmed in eFuse
* @substrate_version: Substrate version programmed in eFuse * @substrate_version: Substrate version programmed in eFuse
* @eq_health_check_supported: eq health check feature supported in FW.
* @fw_hbm_region_size: Size in bytes of FW reserved region in HBM. * @fw_hbm_region_size: Size in bytes of FW reserved region in HBM.
* @fw_os_version: Firmware OS Version * @fw_os_version: Firmware OS Version
*/ */
...@@ -1169,7 +1181,7 @@ struct cpucp_info { ...@@ -1169,7 +1181,7 @@ struct cpucp_info {
__u8 xbar_binning_mask; __u8 xbar_binning_mask;
__u8 interposer_version; __u8 interposer_version;
__u8 substrate_version; __u8 substrate_version;
__u8 reserved2; __u8 eq_health_check_supported;
struct cpucp_security_info sec_info; struct cpucp_security_info sec_info;
__le32 fw_hbm_region_size; __le32 fw_hbm_region_size;
__u8 pll_map[PLL_MAP_LEN]; __u8 pll_map[PLL_MAP_LEN];
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment