Commit 76cedc73 authored by Omer Shpigelman, committed by Oded Gabbay

habanalabs: remove stop-on-error flag from DMA

Stop-on-error mode in DMA is useful as it stops the transaction
immediately upon an error, e.g. a page fault.
But it may cause the next command submission to fail, as it leaves the DMA
in an unstable state.
Therefore we remove the stop-on-error configuration from the DMA.
Stop-on-error is still available for debug.
Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
parent 3ec499c9
...@@ -150,3 +150,10 @@ KernelVersion: 5.1 ...@@ -150,3 +150,10 @@ KernelVersion: 5.1
Contact: oded.gabbay@gmail.com Contact: oded.gabbay@gmail.com
Description: Displays a list with information about all the active virtual Description: Displays a list with information about all the active virtual
address mappings per ASID address mappings per ASID
What: /sys/kernel/debug/habanalabs/hl<n>/stop_on_err
Date: Mar 2020
KernelVersion: 5.6
Contact: oded.gabbay@gmail.com
Description: Sets the stop-on-error option for the device engines. A value of
"0" disables it; any other value enables it.
...@@ -970,6 +970,49 @@ static ssize_t hl_device_write(struct file *f, const char __user *buf, ...@@ -970,6 +970,49 @@ static ssize_t hl_device_write(struct file *f, const char __user *buf,
return count; return count;
} }
/*
 * hl_stop_on_err_read - debugfs read handler that reports hdev->stop_on_err
 * @f: debugfs file; its inode's i_private holds the hl_dbg_device_entry
 * @buf: user buffer to fill
 * @count: size of @buf in bytes
 * @ppos: file position, advanced by the number of bytes copied
 *
 * Formats the flag as a decimal number followed by a newline and copies it
 * to user space.
 *
 * Return: number of bytes copied (0 once the whole text has been consumed),
 * or the negative error code from simple_read_from_buffer().
 */
static ssize_t hl_stop_on_err_read(struct file *f, char __user *buf,
					size_t count, loff_t *ppos)
{
	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
	struct hl_device *hdev = entry->hdev;
	char tmp_buf[16];
	int len;

	/* A u8 needs at most 3 digits + '\n' + NUL, so this never truncates */
	len = snprintf(tmp_buf, sizeof(tmp_buf), "%d\n", hdev->stop_on_err);

	/*
	 * Honor the caller's @count instead of always copying the full text,
	 * so a short user buffer is never overrun. simple_read_from_buffer()
	 * also handles a non-zero *ppos, so no explicit offset check is
	 * needed. The terminating NUL stays part of the readable data
	 * (len + 1), so a full-size read yields the same bytes as before.
	 */
	return simple_read_from_buffer(buf, count, ppos, tmp_buf, len + 1);
}
/*
 * hl_stop_on_err_write - debugfs write handler that updates hdev->stop_on_err
 * @f: debugfs file; its inode's i_private holds the hl_dbg_device_entry
 * @buf: user buffer holding an unsigned decimal number
 * @count: number of bytes in @buf
 * @ppos: file position (not used)
 *
 * Parses the value written by the user, stores it normalized to 0/1 in
 * hdev->stop_on_err and then calls hl_device_reset().
 *
 * Return: @count on success, -EBUSY if the device is currently in reset,
 * or the error returned by kstrtouint_from_user() for malformed input.
 */
static ssize_t hl_stop_on_err_write(struct file *f, const char __user *buf,
					size_t count, loff_t *ppos)
{
	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
	struct hl_device *hdev = entry->hdev;
	u32 value;
	ssize_t rc;

	if (atomic_read(&hdev->in_reset)) {
		dev_warn_ratelimited(hdev->dev,
			"Can't change stop on error during reset\n");
		/*
		 * Report a real error: returning 0 from a write handler tells
		 * user space that nothing was consumed, which makes common
		 * writers retry the same data indefinitely.
		 */
		return -EBUSY;
	}

	rc = kstrtouint_from_user(buf, count, 10, &value);
	if (rc)
		return rc;

	/* Any non-zero input enables the option */
	hdev->stop_on_err = value ? 1 : 0;

	/*
	 * NOTE(review): presumably the reset is what applies the new setting
	 * to the hardware (the engines' error configuration is programmed
	 * during init) - confirm against the ASIC init code.
	 */
	hl_device_reset(hdev, false, false);

	return count;
}
static const struct file_operations hl_data32b_fops = { static const struct file_operations hl_data32b_fops = {
.owner = THIS_MODULE, .owner = THIS_MODULE,
.read = hl_data_read32, .read = hl_data_read32,
...@@ -1015,6 +1058,12 @@ static const struct file_operations hl_device_fops = { ...@@ -1015,6 +1058,12 @@ static const struct file_operations hl_device_fops = {
.write = hl_device_write .write = hl_device_write
}; };
/* File operations table wiring the stop_on_err read and write handlers */
static const struct file_operations hl_stop_on_err_fops = {
	.write = hl_stop_on_err_write,
	.read = hl_stop_on_err_read,
	.owner = THIS_MODULE,
};
static const struct hl_info_list hl_debugfs_list[] = { static const struct hl_info_list hl_debugfs_list[] = {
{"command_buffers", command_buffers_show, NULL}, {"command_buffers", command_buffers_show, NULL},
{"command_submission", command_submission_show, NULL}, {"command_submission", command_submission_show, NULL},
...@@ -1152,6 +1201,12 @@ void hl_debugfs_add_device(struct hl_device *hdev) ...@@ -1152,6 +1201,12 @@ void hl_debugfs_add_device(struct hl_device *hdev)
dev_entry, dev_entry,
&hl_device_fops); &hl_device_fops);
debugfs_create_file("stop_on_err",
0644,
dev_entry->root,
dev_entry,
&hl_stop_on_err_fops);
for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) { for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
ent = debugfs_create_file(hl_debugfs_list[i].name, ent = debugfs_create_file(hl_debugfs_list[i].name,
......
...@@ -800,6 +800,7 @@ static void goya_init_dma_qman(struct hl_device *hdev, int dma_id, ...@@ -800,6 +800,7 @@ static void goya_init_dma_qman(struct hl_device *hdev, int dma_id,
u32 so_base_lo, so_base_hi; u32 so_base_lo, so_base_hi;
u32 gic_base_lo, gic_base_hi; u32 gic_base_lo, gic_base_hi;
u32 reg_off = dma_id * (mmDMA_QM_1_PQ_PI - mmDMA_QM_0_PQ_PI); u32 reg_off = dma_id * (mmDMA_QM_1_PQ_PI - mmDMA_QM_0_PQ_PI);
u32 dma_err_cfg = QMAN_DMA_ERR_MSG_EN;
mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0); mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
...@@ -836,7 +837,10 @@ static void goya_init_dma_qman(struct hl_device *hdev, int dma_id, ...@@ -836,7 +837,10 @@ static void goya_init_dma_qman(struct hl_device *hdev, int dma_id,
else else
WREG32(mmDMA_QM_0_GLBL_PROT + reg_off, QMAN_DMA_FULLY_TRUSTED); WREG32(mmDMA_QM_0_GLBL_PROT + reg_off, QMAN_DMA_FULLY_TRUSTED);
WREG32(mmDMA_QM_0_GLBL_ERR_CFG + reg_off, QMAN_DMA_ERR_MSG_EN); if (hdev->stop_on_err)
dma_err_cfg |= 1 << DMA_QM_0_GLBL_ERR_CFG_DMA_STOP_ON_ERR_SHIFT;
WREG32(mmDMA_QM_0_GLBL_ERR_CFG + reg_off, dma_err_cfg);
WREG32(mmDMA_QM_0_GLBL_CFG0 + reg_off, QMAN_DMA_ENABLE); WREG32(mmDMA_QM_0_GLBL_CFG0 + reg_off, QMAN_DMA_ENABLE);
} }
......
...@@ -1300,6 +1300,7 @@ struct hl_device_idle_busy_ts { ...@@ -1300,6 +1300,7 @@ struct hl_device_idle_busy_ts {
* @in_debug: is device under debug. This, together with fpriv_list, enforces * @in_debug: is device under debug. This, together with fpriv_list, enforces
* that only a single user is configuring the debug infrastructure. * that only a single user is configuring the debug infrastructure.
* @cdev_sysfs_created: were char devices and sysfs nodes created. * @cdev_sysfs_created: were char devices and sysfs nodes created.
* @stop_on_err: true if engines should stop on error.
*/ */
struct hl_device { struct hl_device {
struct pci_dev *pdev; struct pci_dev *pdev;
...@@ -1380,6 +1381,7 @@ struct hl_device { ...@@ -1380,6 +1381,7 @@ struct hl_device {
u8 dma_mask; u8 dma_mask;
u8 in_debug; u8 in_debug;
u8 cdev_sysfs_created; u8 cdev_sysfs_created;
u8 stop_on_err;
/* Parameters for bring-up */ /* Parameters for bring-up */
u8 mmu_enable; u8 mmu_enable;
......
...@@ -55,8 +55,7 @@ ...@@ -55,8 +55,7 @@
(1 << DMA_QM_0_GLBL_ERR_CFG_DMA_ERR_MSG_EN_SHIFT) | \ (1 << DMA_QM_0_GLBL_ERR_CFG_DMA_ERR_MSG_EN_SHIFT) | \
(1 << DMA_QM_0_GLBL_ERR_CFG_PQF_STOP_ON_ERR_SHIFT) | \ (1 << DMA_QM_0_GLBL_ERR_CFG_PQF_STOP_ON_ERR_SHIFT) | \
(1 << DMA_QM_0_GLBL_ERR_CFG_CQF_STOP_ON_ERR_SHIFT) | \ (1 << DMA_QM_0_GLBL_ERR_CFG_CQF_STOP_ON_ERR_SHIFT) | \
(1 << DMA_QM_0_GLBL_ERR_CFG_CP_STOP_ON_ERR_SHIFT) | \ (1 << DMA_QM_0_GLBL_ERR_CFG_CP_STOP_ON_ERR_SHIFT))
(1 << DMA_QM_0_GLBL_ERR_CFG_DMA_STOP_ON_ERR_SHIFT))
#define QMAN_MME_ENABLE (\ #define QMAN_MME_ENABLE (\
(1 << MME_QM_GLBL_CFG0_PQF_EN_SHIFT) | \ (1 << MME_QM_GLBL_CFG0_PQF_EN_SHIFT) | \
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment