Commit d8b9cea5 authored by Ofir Bitton, committed by Oded Gabbay

accel/habanalabs: add pci health check during heartbeat

Currently upon a heartbeat failure, we don't know if the failure
is due to firmware hang or due to a bad PCI link. Hence, we
are reading a PCI config space register with a known value (vendor ID)
so we will know which of the two possibilities caused the heartbeat
failure.
Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent 3d21ec64
...@@ -981,6 +981,18 @@ static void device_early_fini(struct hl_device *hdev) ...@@ -981,6 +981,18 @@ static void device_early_fini(struct hl_device *hdev)
hdev->asic_funcs->early_fini(hdev); hdev->asic_funcs->early_fini(hdev);
} }
/**
 * is_pci_link_healthy() - probe the PCI link by reading a register whose
 *                         value is known in advance.
 * @hdev: habanalabs device structure.
 *
 * Reads the vendor ID word from the device's PCI config space and compares
 * it with PCI_VENDOR_ID_HABANALABS.
 *
 * Return: true if the config read succeeded and returned the expected
 *         vendor ID; false if there is no PCI device, the read reported an
 *         error, or the value read does not match.
 */
static bool is_pci_link_healthy(struct hl_device *hdev)
{
	u16 vendor_id = 0;

	if (!hdev->pdev)
		return false;

	/*
	 * A non-zero return means the config access itself failed, so the
	 * contents of vendor_id cannot be trusted - report the link as broken
	 * instead of comparing a possibly stale value.
	 */
	if (pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id))
		return false;

	return vendor_id == PCI_VENDOR_ID_HABANALABS;
}
static void hl_device_heartbeat(struct work_struct *work) static void hl_device_heartbeat(struct work_struct *work)
{ {
struct hl_device *hdev = container_of(work, struct hl_device, struct hl_device *hdev = container_of(work, struct hl_device,
...@@ -995,7 +1007,8 @@ static void hl_device_heartbeat(struct work_struct *work) ...@@ -995,7 +1007,8 @@ static void hl_device_heartbeat(struct work_struct *work)
goto reschedule; goto reschedule;
if (hl_device_operational(hdev, NULL)) if (hl_device_operational(hdev, NULL))
dev_err(hdev->dev, "Device heartbeat failed!\n"); dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n",
is_pci_link_healthy(hdev) ? "healthy" : "broken");
info.err_type = HL_INFO_FW_HEARTBEAT_ERR; info.err_type = HL_INFO_FW_HEARTBEAT_ERR;
info.event_mask = &event_mask; info.event_mask = &event_mask;
......
...@@ -36,6 +36,8 @@ ...@@ -36,6 +36,8 @@
struct hl_device; struct hl_device;
struct hl_fpriv; struct hl_fpriv;
/* Expected value of the PCI_VENDOR_ID config-space register on a habanalabs device */
#define PCI_VENDOR_ID_HABANALABS 0x1da3
/* Use upper bits of mmap offset to store habana driver specific information. /* Use upper bits of mmap offset to store habana driver specific information.
* bits[63:59] - Encode mmap type * bits[63:59] - Encode mmap type
* bits[45:0] - mmap offset value * bits[45:0] - mmap offset value
......
...@@ -54,8 +54,6 @@ module_param(boot_error_status_mask, ulong, 0444); ...@@ -54,8 +54,6 @@ module_param(boot_error_status_mask, ulong, 0444);
MODULE_PARM_DESC(boot_error_status_mask, MODULE_PARM_DESC(boot_error_status_mask,
"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)"); "Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
#define PCI_VENDOR_ID_HABANALABS 0x1da3
#define PCI_IDS_GOYA 0x0001 #define PCI_IDS_GOYA 0x0001
#define PCI_IDS_GAUDI 0x1000 #define PCI_IDS_GAUDI 0x1000
#define PCI_IDS_GAUDI_SEC 0x1010 #define PCI_IDS_GAUDI_SEC 0x1010
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment