Commit 09c34e8d authored by Felix Kuehling's avatar Felix Kuehling Committed by Alex Deucher

drm/amdkfd: Improve HWS hang detection and handling

Move HWS hang detection into unmap_queues_cpsch to catch hangs in all
cases. If this happens during a reset, don't schedule another reset
because the reset already in progress is expected to take care of it.
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Tested-by: Emily Deng <Emily.Deng@amd.com>
Reviewed-by: shaoyunl <shaoyun.liu@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 63e088ac
...@@ -728,6 +728,9 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) ...@@ -728,6 +728,9 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
{ {
if (!kfd->init_complete) if (!kfd->init_complete)
return 0; return 0;
kfd->dqm->ops.pre_reset(kfd->dqm);
kgd2kfd_suspend(kfd); kgd2kfd_suspend(kfd);
kfd_signal_reset_event(kfd); kfd_signal_reset_event(kfd);
......
...@@ -952,6 +952,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm) ...@@ -952,6 +952,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
return 0; return 0;
} }
/*
 * Mark the device queue manager as undergoing a GPU reset.
 *
 * Invoked via dqm->ops.pre_reset from kgd2kfd_pre_reset() before the
 * device is suspended.  Setting is_resetting lets the HWS hang detection
 * in unmap_queues_cpsch() know that a reset is already in progress, so it
 * does not schedule a second, redundant reset via hw_exception_work.
 * The flag is cleared again in start_cpsch() when the scheduler restarts.
 *
 * Taken under dqm_lock so the flag update is ordered with respect to
 * concurrent queue-manager operations that read it.
 */
static void pre_reset(struct device_queue_manager *dqm)
{
dqm_lock(dqm);
dqm->is_resetting = true;
dqm_unlock(dqm);
}
static int allocate_sdma_queue(struct device_queue_manager *dqm, static int allocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q) struct queue *q)
{ {
...@@ -1099,6 +1106,7 @@ static int start_cpsch(struct device_queue_manager *dqm) ...@@ -1099,6 +1106,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
dqm_lock(dqm); dqm_lock(dqm);
/* clear hang status when driver try to start the hw scheduler */ /* clear hang status when driver try to start the hw scheduler */
dqm->is_hws_hang = false; dqm->is_hws_hang = false;
dqm->is_resetting = false;
dqm->sched_running = true; dqm->sched_running = true;
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
dqm_unlock(dqm); dqm_unlock(dqm);
...@@ -1351,8 +1359,17 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, ...@@ -1351,8 +1359,17 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
/* should be timed out */ /* should be timed out */
retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED, retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
queue_preemption_timeout_ms); queue_preemption_timeout_ms);
if (retval) if (retval) {
pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
dqm->is_hws_hang = true;
/* It's possible we're detecting a HWS hang in the
* middle of a GPU reset. No need to schedule another
* reset in this case.
*/
if (!dqm->is_resetting)
schedule_work(&dqm->hw_exception_work);
return retval; return retval;
}
pm_release_ib(&dqm->packets); pm_release_ib(&dqm->packets);
dqm->active_runlist = false; dqm->active_runlist = false;
...@@ -1370,12 +1387,8 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm, ...@@ -1370,12 +1387,8 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
if (dqm->is_hws_hang) if (dqm->is_hws_hang)
return -EIO; return -EIO;
retval = unmap_queues_cpsch(dqm, filter, filter_param); retval = unmap_queues_cpsch(dqm, filter, filter_param);
if (retval) { if (retval)
pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
dqm->is_hws_hang = true;
schedule_work(&dqm->hw_exception_work);
return retval; return retval;
}
return map_queues_cpsch(dqm); return map_queues_cpsch(dqm);
} }
...@@ -1769,6 +1782,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) ...@@ -1769,6 +1782,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
dqm->ops.initialize = initialize_cpsch; dqm->ops.initialize = initialize_cpsch;
dqm->ops.start = start_cpsch; dqm->ops.start = start_cpsch;
dqm->ops.stop = stop_cpsch; dqm->ops.stop = stop_cpsch;
dqm->ops.pre_reset = pre_reset;
dqm->ops.destroy_queue = destroy_queue_cpsch; dqm->ops.destroy_queue = destroy_queue_cpsch;
dqm->ops.update_queue = update_queue; dqm->ops.update_queue = update_queue;
dqm->ops.register_process = register_process; dqm->ops.register_process = register_process;
...@@ -1787,6 +1801,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) ...@@ -1787,6 +1801,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
/* initialize dqm for no cp scheduling */ /* initialize dqm for no cp scheduling */
dqm->ops.start = start_nocpsch; dqm->ops.start = start_nocpsch;
dqm->ops.stop = stop_nocpsch; dqm->ops.stop = stop_nocpsch;
dqm->ops.pre_reset = pre_reset;
dqm->ops.create_queue = create_queue_nocpsch; dqm->ops.create_queue = create_queue_nocpsch;
dqm->ops.destroy_queue = destroy_queue_nocpsch; dqm->ops.destroy_queue = destroy_queue_nocpsch;
dqm->ops.update_queue = update_queue; dqm->ops.update_queue = update_queue;
......
...@@ -104,6 +104,7 @@ struct device_queue_manager_ops { ...@@ -104,6 +104,7 @@ struct device_queue_manager_ops {
int (*initialize)(struct device_queue_manager *dqm); int (*initialize)(struct device_queue_manager *dqm);
int (*start)(struct device_queue_manager *dqm); int (*start)(struct device_queue_manager *dqm);
int (*stop)(struct device_queue_manager *dqm); int (*stop)(struct device_queue_manager *dqm);
void (*pre_reset)(struct device_queue_manager *dqm);
void (*uninitialize)(struct device_queue_manager *dqm); void (*uninitialize)(struct device_queue_manager *dqm);
int (*create_kernel_queue)(struct device_queue_manager *dqm, int (*create_kernel_queue)(struct device_queue_manager *dqm,
struct kernel_queue *kq, struct kernel_queue *kq,
...@@ -198,6 +199,7 @@ struct device_queue_manager { ...@@ -198,6 +199,7 @@ struct device_queue_manager {
/* hw exception */ /* hw exception */
bool is_hws_hang; bool is_hws_hang;
bool is_resetting;
struct work_struct hw_exception_work; struct work_struct hw_exception_work;
struct kfd_mem_obj hiq_sdma_mqd; struct kfd_mem_obj hiq_sdma_mqd;
bool sched_running; bool sched_running;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment