Commit 73ea648d authored by Shaoyun Liu, committed by Oded Gabbay

drm/amdkfd: Implement hang detection in KFD and call amdgpu

The reset is performed in a new hw_exception work item, so an HWS (hardware
scheduler) hang can be handled without blocking the thread that detected the hang.
Signed-off-by: Shaoyun Liu <Shaoyun.Liu@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
parent 24da5a9c
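The patch follows the standard kernel deferred-work pattern: the thread that
detects the hang only sets a flag and queues a work item, and the heavyweight
GPU reset runs later in process context. Below is a minimal sketch of that
pattern in isolation, using hypothetical my_dev names; only INIT_WORK(),
schedule_work() and container_of() are the real APIs the patch relies on.

/*
 * Minimal sketch of the deferred-work pattern used by this patch.
 * The "my_dev" names are hypothetical stand-ins for device_queue_manager.
 */
#include <linux/workqueue.h>
#include <linux/printk.h>

struct my_dev {
	bool hang;                        /* mirrors dqm->is_hws_hang */
	struct work_struct recover_work;  /* mirrors dqm->hw_exception_work */
};

static void my_dev_recover(struct work_struct *work)
{
	/* Runs later in process context, where a slow reset may block. */
	struct my_dev *dev = container_of(work, struct my_dev, recover_work);

	pr_info("recovering device %p\n", dev);
	dev->hang = false;
}

static void my_dev_init(struct my_dev *dev)
{
	dev->hang = false;
	INIT_WORK(&dev->recover_work, my_dev_recover);  /* bind handler once */
}

static void my_dev_detect_hang(struct my_dev *dev)
{
	/* Detection path: flag the hang and hand off; never blocks here. */
	dev->hang = true;
	schedule_work(&dev->recover_work);
}

Because schedule_work() may be called from contexts that cannot sleep, the
detection site stays cheap; only the worker pays the cost of the reset.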
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

@@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
 static void deallocate_sdma_queue(struct device_queue_manager *dqm,
 				unsigned int sdma_queue_id);
 
+static void kfd_process_hw_exception(struct work_struct *work);
+
 static inline
 enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
 {
@@ -1010,6 +1012,8 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 	dqm->active_runlist = false;
 	dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1;
 
+	INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception);
+
 	return 0;
 }
@@ -1042,6 +1046,8 @@ static int start_cpsch(struct device_queue_manager *dqm)
 	init_interrupts(dqm);
 
 	dqm_lock(dqm);
+	/* clear hang status when driver try to start the hw scheduler */
+	dqm->is_hws_hang = false;
 	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
 	dqm_unlock(dqm);
@@ -1255,6 +1261,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 {
 	int retval = 0;
 
+	if (dqm->is_hws_hang)
+		return -EIO;
 	if (!dqm->active_runlist)
 		return retval;
@@ -1293,9 +1301,13 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
 {
 	int retval;
 
+	if (dqm->is_hws_hang)
+		return -EIO;
 	retval = unmap_queues_cpsch(dqm, filter, filter_param);
 	if (retval) {
 		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
+		dqm->is_hws_hang = true;
+		schedule_work(&dqm->hw_exception_work);
 		return retval;
 	}
@@ -1543,7 +1555,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 	}
 
 	retval = execute_queues_cpsch(dqm, filter, 0);
-	if (retval || qpd->reset_wavefronts) {
+	if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) {
 		pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
 		dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process);
 		qpd->reset_wavefronts = false;
@@ -1701,6 +1713,13 @@ int kfd_process_vm_fault(struct device_queue_manager *dqm,
 	return ret;
 }
 
+static void kfd_process_hw_exception(struct work_struct *work)
+{
+	struct device_queue_manager *dqm = container_of(work,
+			struct device_queue_manager, hw_exception_work);
+
+	dqm->dev->kfd2kgd->gpu_recover(dqm->dev->kgd);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static void seq_reg_dump(struct seq_file *m,
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -193,6 +193,10 @@ struct device_queue_manager {
 	struct kfd_mem_obj *fence_mem;
 	bool active_runlist;
 	int sched_policy;
+
+	/* hw exception */
+	bool is_hws_hang;
+	struct work_struct hw_exception_work;
 };
 
 void device_queue_manager_init_cik(
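This commit adds only the KFD side; the kfd2kgd->gpu_recover hook it calls is
implemented in amdgpu by a companion change. A hedged sketch of what that glue
plausibly looks like (the function name amdgpu_amdkfd_gpu_reset and the exact
amdgpu_device_gpu_recover() signature vary across kernel versions and are
assumptions here, not part of this diff):

/*
 * Assumed amdgpu-side implementation of the kfd2kgd->gpu_recover hook.
 * Names and the amdgpu_device_gpu_recover() signature are era-dependent
 * assumptions, not taken from this commit.
 */
void amdgpu_amdkfd_gpu_reset(struct kgd_dev *kgd)
{
	/* KFD hands amdgpu an opaque kgd_dev that is really an amdgpu_device */
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;

	/* No specific job is associated with an HWS hang, so pass NULL */
	amdgpu_device_gpu_recover(adev, NULL, false);
}

Until that recovery completes, the new -EIO early-outs in unmap_queues_cpsch()
and execute_queues_cpsch() keep further map/unmap traffic away from the hung
scheduler; start_cpsch() clears is_hws_hang when the scheduler is restarted.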