Commit e42a6400 authored by Ohad Sharabi, committed by Oded Gabbay

habanalabs: skip DISABLE PCI packet to FW on heartbeat

If the reset is due to a heartbeat failure, the device CPU is not responsive, so there is no point in sending it the PCI disable message.
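
For illustration only, here is a minimal stand-alone sketch of the flags convention this patch introduces. It is toy code, not driver code, but the flag names and the skip condition mirror the diff below:

    /* Toy model of the hl_device_reset() flag handling (illustrative only). */
    #include <stdbool.h>
    #include <stdio.h>

    #define HL_RESET_HARD              (1 << 0)
    #define HL_RESET_FROM_RESET_THREAD (1 << 1)
    #define HL_RESET_HEARTBEAT         (1 << 2)

    static void toy_device_reset(unsigned int flags)
    {
            bool hard_reset = (flags & HL_RESET_HARD) != 0;

            /* On a heartbeat-driven reset the device CPU is not responding,
             * so the PCI-disable message to the firmware is skipped.
             */
            if (hard_reset && !(flags & HL_RESET_HEARTBEAT))
                    printf("sending PCI disable packet to FW\n");
            else
                    printf("skipping PCI disable packet\n");
    }

    int main(void)
    {
            toy_device_reset(HL_RESET_HARD);                       /* sends the packet */
            toy_device_reset(HL_RESET_HARD | HL_RESET_HEARTBEAT);  /* skips it */
            return 0;
    }
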
Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent d5eb8373
@@ -620,7 +620,7 @@ static void cs_timedout(struct work_struct *work)
 	cs_put(cs);
 
 	if (hdev->reset_on_lockup)
-		hl_device_reset(hdev, false, false);
+		hl_device_reset(hdev, 0);
 	else
 		hdev->needs_reset = true;
 }
@@ -1473,7 +1473,7 @@ static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
 out:
 	if ((rc == -ETIMEDOUT || rc == -EBUSY) && (need_soft_reset))
-		hl_device_reset(hdev, false, false);
+		hl_device_reset(hdev, 0);
 
 	return rc;
 }
...
@@ -887,7 +887,7 @@ static ssize_t hl_stop_on_err_write(struct file *f, const char __user *buf,
 	hdev->stop_on_err = value ? 1 : 0;
 
-	hl_device_reset(hdev, false, false);
+	hl_device_reset(hdev, 0);
 
 	return count;
 }
...
@@ -72,7 +72,7 @@ static void hpriv_release(struct kref *ref)
 	kfree(hpriv);
 
 	if (hdev->reset_upon_device_release)
-		hl_device_reset(hdev, false, false);
+		hl_device_reset(hdev, 0);
 }
 
 void hl_hpriv_get(struct hl_fpriv *hpriv)
@@ -293,7 +293,7 @@ static void device_hard_reset_pending(struct work_struct *work)
 	struct hl_device *hdev = device_reset_work->hdev;
 	int rc;
 
-	rc = hl_device_reset(hdev, true, true);
+	rc = hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD);
 	if ((rc == -EBUSY) && !hdev->device_fini_pending) {
 		dev_info(hdev->dev,
 			"Could not reset device. will try again in %u seconds",
@@ -495,7 +495,7 @@ static void hl_device_heartbeat(struct work_struct *work)
 		goto reschedule;
 
 	dev_err(hdev->dev, "Device heartbeat failed!\n");
-	hl_device_reset(hdev, true, false);
+	hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_HEARTBEAT);
 
 	return;
@@ -819,7 +819,7 @@ int hl_device_resume(struct hl_device *hdev)
 	hdev->disabled = false;
 	atomic_set(&hdev->in_reset, 0);
 
-	rc = hl_device_reset(hdev, true, false);
+	rc = hl_device_reset(hdev, HL_RESET_HARD);
 	if (rc) {
 		dev_err(hdev->dev, "Failed to reset device during resume\n");
 		goto disable_device;
@@ -925,9 +925,7 @@ static void device_disable_open_processes(struct hl_device *hdev)
  * hl_device_reset - reset the device
  *
  * @hdev: pointer to habanalabs device structure
- * @hard_reset: should we do hard reset to all engines or just reset the
- *              compute/dma engines
- * @from_hard_reset_thread: is the caller the hard-reset thread
+ * @flags: reset flags.
  *
  * Block future CS and wait for pending CS to be enqueued
  * Call ASIC H/W fini
@@ -939,10 +937,10 @@ static void device_disable_open_processes(struct hl_device *hdev)
  *
  * Returns 0 for success or an error on failure.
  */
-int hl_device_reset(struct hl_device *hdev, bool hard_reset,
-			bool from_hard_reset_thread)
+int hl_device_reset(struct hl_device *hdev, u32 flags)
 {
 	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
+	bool hard_reset, from_hard_reset_thread;
 	int i, rc;
 
 	if (!hdev->init_done) {
@@ -951,6 +949,9 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 		return 0;
 	}
 
+	hard_reset = (flags & HL_RESET_HARD) != 0;
+	from_hard_reset_thread = (flags & HL_RESET_FROM_RESET_THREAD) != 0;
+
 	if ((!hard_reset) && (!hdev->supports_soft_reset)) {
 		dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
 		hard_reset = true;
@@ -971,7 +972,11 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 		if (rc)
 			return 0;
 
-	if (hard_reset) {
+	/*
+	 * if reset is due to heartbeat, device CPU is no responsive in
+	 * which case no point sending PCI disable message to it
+	 */
+	if (hard_reset && !(flags & HL_RESET_HEARTBEAT)) {
 		/* Disable PCI access from device F/W so he won't send
 		 * us additional interrupts. We disable MSI/MSI-X at
 		 * the halt_engines function and we can't have the F/W
...
@@ -104,6 +104,23 @@ enum hl_mmu_page_table_location {
 
 #define HL_MAX_DCORES		4
 
+/*
+ * Reset Flags
+ *
+ * - HL_RESET_HARD
+ *       If set do hard reset to all engines. If not set reset just
+ *       compute/DMA engines.
+ *
+ * - HL_RESET_FROM_RESET_THREAD
+ *       Set if the caller is the hard-reset thread
+ *
+ * - HL_RESET_HEARTBEAT
+ *       Set if reset is due to heartbeat
+ */
+#define HL_RESET_HARD			(1 << 0)
+#define HL_RESET_FROM_RESET_THREAD	(1 << 1)
+#define HL_RESET_HEARTBEAT		(1 << 2)
+
 #define HL_MAX_SOBS_PER_MONITOR	8
 
 /**
@@ -2242,8 +2259,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass);
 void hl_device_fini(struct hl_device *hdev);
 int hl_device_suspend(struct hl_device *hdev);
 int hl_device_resume(struct hl_device *hdev);
-int hl_device_reset(struct hl_device *hdev, bool hard_reset,
-			bool from_hard_reset_thread);
+int hl_device_reset(struct hl_device *hdev, u32 flags);
 void hl_hpriv_get(struct hl_fpriv *hpriv);
 int hl_hpriv_put(struct hl_fpriv *hpriv);
 int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq);
...
@@ -203,7 +203,7 @@ static ssize_t soft_reset_store(struct device *dev,
 	dev_warn(hdev->dev, "Soft-Reset requested through sysfs\n");
 
-	hl_device_reset(hdev, false, false);
+	hl_device_reset(hdev, 0);
 
 out:
 	return count;
@@ -226,7 +226,7 @@ static ssize_t hard_reset_store(struct device *dev,
 	dev_warn(hdev->dev, "Hard-Reset requested through sysfs\n");
 
-	hl_device_reset(hdev, true, false);
+	hl_device_reset(hdev, HL_RESET_HARD);
 
 out:
 	return count;
...
@@ -7380,18 +7380,14 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	case GAUDI_EVENT_MMU_DERR:
 		gaudi_print_irq_info(hdev, event_type, true);
 		gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
-		if (hdev->hard_reset_on_fw_events)
-			hl_device_reset(hdev, true, false);
-		break;
+		goto reset_device;
 
 	case GAUDI_EVENT_GIC500:
 	case GAUDI_EVENT_AXI_ECC:
 	case GAUDI_EVENT_L2_RAM_ECC:
 	case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17:
 		gaudi_print_irq_info(hdev, event_type, false);
-		if (hdev->hard_reset_on_fw_events)
-			hl_device_reset(hdev, true, false);
-		break;
+		goto reset_device;
 
 	case GAUDI_EVENT_HBM0_SPI_0:
 	case GAUDI_EVENT_HBM1_SPI_0:
@@ -7401,9 +7397,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 		gaudi_hbm_read_interrupts(hdev,
 				gaudi_hbm_event_to_dev(event_type),
 				&eq_entry->hbm_ecc_data);
-		if (hdev->hard_reset_on_fw_events)
-			hl_device_reset(hdev, true, false);
-		break;
+		goto reset_device;
 
 	case GAUDI_EVENT_HBM0_SPI_1:
 	case GAUDI_EVENT_HBM1_SPI_1:
@@ -7432,8 +7426,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 			dev_err(hdev->dev, "hard reset required due to %s\n",
 				gaudi_irq_map_table[event_type].name);
-			if (hdev->hard_reset_on_fw_events)
-				hl_device_reset(hdev, true, false);
+			goto reset_device;
 		} else {
 			hl_fw_unmask_irq(hdev, event_type);
 		}
@@ -7455,8 +7448,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 			dev_err(hdev->dev, "hard reset required due to %s\n",
 				gaudi_irq_map_table[event_type].name);
-			if (hdev->hard_reset_on_fw_events)
-				hl_device_reset(hdev, true, false);
+			goto reset_device;
 		} else {
 			hl_fw_unmask_irq(hdev, event_type);
 		}
@@ -7525,9 +7517,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	case GAUDI_EVENT_RAZWI_OR_ADC_SW:
 		gaudi_print_irq_info(hdev, event_type, true);
-		if (hdev->hard_reset_on_fw_events)
-			hl_device_reset(hdev, true, false);
-		break;
+		goto reset_device;
 
 	case GAUDI_EVENT_TPC0_BMON_SPMU:
 	case GAUDI_EVENT_TPC1_BMON_SPMU:
@@ -7564,17 +7554,21 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	case GAUDI_EVENT_PKT_QUEUE_OUT_SYNC:
 		gaudi_print_irq_info(hdev, event_type, false);
 		gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
-		if (hdev->hard_reset_on_fw_events)
-			hl_device_reset(hdev, true, false);
-		else
-			hl_fw_unmask_irq(hdev, event_type);
-		break;
+		goto reset_device;
 
 	default:
 		dev_err(hdev->dev, "Received invalid H/W interrupt %d\n",
 			event_type);
 		break;
 	}
+
+	return;
+
+reset_device:
+	if (hdev->hard_reset_on_fw_events)
+		hl_device_reset(hdev, HL_RESET_HARD);
+	else
+		hl_fw_unmask_irq(hdev, event_type);
 }
 
 static void *gaudi_get_events_stat(struct hl_device *hdev, bool aggregate,
@@ -7625,7 +7619,7 @@ static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 	if (rc) {
 		dev_err_ratelimited(hdev->dev,
 					"MMU cache invalidation timeout\n");
-		hl_device_reset(hdev, true, false);
+		hl_device_reset(hdev, HL_RESET_HARD);
 	}
 
 	return rc;
@@ -7674,7 +7668,7 @@ static int gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
 	if (rc) {
 		dev_err_ratelimited(hdev->dev,
					"MMU cache invalidation timeout\n");
-		hl_device_reset(hdev, true, false);
+		hl_device_reset(hdev, HL_RESET_HARD);
 	}
 
 	return rc;
...
@@ -4712,7 +4712,7 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
 	case GOYA_ASYNC_EVENT_ID_PSOC_GPIO_05_SW_RESET:
 		goya_print_irq_info(hdev, event_type, false);
 		if (hdev->hard_reset_on_fw_events)
-			hl_device_reset(hdev, true, false);
+			hl_device_reset(hdev, HL_RESET_HARD);
 		break;
 
 	case GOYA_ASYNC_EVENT_ID_PCIE_DEC:
@@ -4772,7 +4772,7 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
 		goya_print_irq_info(hdev, event_type, false);
 		goya_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
 		if (hdev->hard_reset_on_fw_events)
-			hl_device_reset(hdev, true, false);
+			hl_device_reset(hdev, HL_RESET_HARD);
 		else
 			hl_fw_unmask_irq(hdev, event_type);
 		break;
@@ -5106,7 +5106,7 @@ static int goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 	if (rc) {
 		dev_err_ratelimited(hdev->dev,
 					"MMU cache invalidation timeout\n");
-		hl_device_reset(hdev, true, false);
+		hl_device_reset(hdev, HL_RESET_HARD);
 	}
 
 	return rc;
@@ -5157,7 +5157,7 @@ static int goya_mmu_invalidate_cache_range(struct hl_device *hdev,
 	if (rc) {
 		dev_err_ratelimited(hdev->dev,
 					"MMU cache invalidation timeout\n");
-		hl_device_reset(hdev, true, false);
+		hl_device_reset(hdev, HL_RESET_HARD);
 	}
 
 	return rc;
...