Commit a28ce422 authored by Oded Gabbay, committed by Greg Kroah-Hartman

habanalabs: disable CPU access on timeouts

This patch provides a workaround for a bug in the F/W where the response
to a request from KMD may take more than 100ms. This could cause the
queue between KMD and the F/W to get out of sync.

The WA is to:
1. Increase the timeout of ALL requests to 1s.
2. In case a request isn't answered in time, mark the state as
"cpu_disabled" and prevent sending further requests from KMD to the F/W.
This will eventually lead to a heartbeat failure and hard reset of the
device.
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 27ca384c
...@@ -723,7 +723,7 @@ static ssize_t hl_device_read(struct file *f, char __user *buf, ...@@ -723,7 +723,7 @@ static ssize_t hl_device_read(struct file *f, char __user *buf,
return 0; return 0;
sprintf(tmp_buf, sprintf(tmp_buf,
"Valid values are: disable, enable, suspend, resume\n"); "Valid values: disable, enable, suspend, resume, cpu_timeout\n");
rc = simple_read_from_buffer(buf, strlen(tmp_buf) + 1, ppos, tmp_buf, rc = simple_read_from_buffer(buf, strlen(tmp_buf) + 1, ppos, tmp_buf,
strlen(tmp_buf) + 1); strlen(tmp_buf) + 1);
...@@ -751,9 +751,11 @@ static ssize_t hl_device_write(struct file *f, const char __user *buf, ...@@ -751,9 +751,11 @@ static ssize_t hl_device_write(struct file *f, const char __user *buf,
hdev->asic_funcs->suspend(hdev); hdev->asic_funcs->suspend(hdev);
} else if (strncmp("resume", data, strlen("resume")) == 0) { } else if (strncmp("resume", data, strlen("resume")) == 0) {
hdev->asic_funcs->resume(hdev); hdev->asic_funcs->resume(hdev);
} else if (strncmp("cpu_timeout", data, strlen("cpu_timeout")) == 0) {
hdev->device_cpu_disabled = true;
} else { } else {
dev_err(hdev->dev, dev_err(hdev->dev,
"Valid values are: disable, enable, suspend, resume\n"); "Valid values: disable, enable, suspend, resume, cpu_timeout\n");
count = -EINVAL; count = -EINVAL;
} }
......
...@@ -636,6 +636,8 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset, ...@@ -636,6 +636,8 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
/* Finished tear-down, starting to re-initialize */ /* Finished tear-down, starting to re-initialize */
if (hard_reset) { if (hard_reset) {
hdev->device_cpu_disabled = false;
/* Allocate the kernel context */ /* Allocate the kernel context */
hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
GFP_KERNEL); GFP_KERNEL);
......
...@@ -3232,6 +3232,11 @@ int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len, ...@@ -3232,6 +3232,11 @@ int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
if (hdev->disabled) if (hdev->disabled)
goto out; goto out;
if (hdev->device_cpu_disabled) {
rc = -EIO;
goto out;
}
rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_CPU_PQ, len, rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_CPU_PQ, len,
pkt_dma_addr); pkt_dma_addr);
if (rc) { if (rc) {
...@@ -3245,8 +3250,8 @@ int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len, ...@@ -3245,8 +3250,8 @@ int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
hl_hw_queue_inc_ci_kernel(hdev, GOYA_QUEUE_ID_CPU_PQ); hl_hw_queue_inc_ci_kernel(hdev, GOYA_QUEUE_ID_CPU_PQ);
if (rc == -ETIMEDOUT) { if (rc == -ETIMEDOUT) {
dev_err(hdev->dev, dev_err(hdev->dev, "Timeout while waiting for device CPU\n");
"Timeout while waiting for CPU packet fence\n"); hdev->device_cpu_disabled = true;
goto out; goto out;
} }
......
...@@ -1079,6 +1079,7 @@ struct hl_device_reset_work { ...@@ -1079,6 +1079,7 @@ struct hl_device_reset_work {
* @dram_default_page_mapping: is DRAM default page mapping enabled. * @dram_default_page_mapping: is DRAM default page mapping enabled.
* @init_done: is the initialization of the device done. * @init_done: is the initialization of the device done.
* @mmu_enable: is MMU enabled. * @mmu_enable: is MMU enabled.
* @device_cpu_disabled: is the device CPU disabled (due to timeouts)
*/ */
struct hl_device { struct hl_device {
struct pci_dev *pdev; struct pci_dev *pdev;
...@@ -1146,6 +1147,7 @@ struct hl_device { ...@@ -1146,6 +1147,7 @@ struct hl_device {
u8 dram_supports_virtual_memory; u8 dram_supports_virtual_memory;
u8 dram_default_page_mapping; u8 dram_default_page_mapping;
u8 init_done; u8 init_done;
u8 device_cpu_disabled;
/* Parameters for bring-up */ /* Parameters for bring-up */
u8 mmu_enable; u8 mmu_enable;
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
#include <linux/pci.h> #include <linux/pci.h>
#include <linux/hwmon.h> #include <linux/hwmon.h>
#define SENSORS_PKT_TIMEOUT 100000 /* 100ms */ #define SENSORS_PKT_TIMEOUT 1000000 /* 1s */
#define HWMON_NR_SENSOR_TYPES (hwmon_pwm + 1) #define HWMON_NR_SENSOR_TYPES (hwmon_pwm + 1)
int hl_build_hwmon_channel_info(struct hl_device *hdev, int hl_build_hwmon_channel_info(struct hl_device *hdev,
......
...@@ -9,8 +9,8 @@ ...@@ -9,8 +9,8 @@
#include <linux/pci.h> #include <linux/pci.h>
#define SET_CLK_PKT_TIMEOUT 200000 /* 200ms */ #define SET_CLK_PKT_TIMEOUT 1000000 /* 1s */
#define SET_PWR_PKT_TIMEOUT 400000 /* 400ms */ #define SET_PWR_PKT_TIMEOUT 1000000 /* 1s */
long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr) long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
{ {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment