drm/amdgpu: add timer to fence to detect scheduler lockup

Change-Id: I67e987db0efdca28faa80b332b75571192130d33 Signed-off-by: Junwei Zhang <Jerry.Zhang@amd.com> Reviewed-by: David Zhou <david1.zhou@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com>

drm/amdgpu: add timer to fence to detect scheduler lockup
Change-Id: I67e987db0efdca28faa80b332b75571192130d33 Signed-off-by: Junwei Zhang <Jerry.Zhang@amd.com> Reviewed-by: David Zhou <david1.zhou@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com>
2440ff2c · Junwei Zhang · Alex Deucher · d6c10f6b · 2440ff2c · 2440ff2c
Commit 2440ff2c authored Oct 10, 2015 by Junwei Zhang Committed by Alex Deucher Oct 14, 2015
3 changed files
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -628,8 +628,20 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring)
 	init_waitqueue_head(&ring->fence_drv.fence_queue);

 	if (amdgpu_enable_scheduler) {
+		long timeout = msecs_to_jiffies(amdgpu_lockup_timeout);
+		if (timeout == 0) {
+			/*
+			 * FIXME:
+			 * Delayed workqueue cannot use it directly,
+			 * so the scheduler will not use delayed workqueue if
+			 * MAX_SCHEDULE_TIMEOUT is set.
+			 * Currently keep it simple and silly.
+			 */
+			timeout = MAX_SCHEDULE_TIMEOUT;
+		}
 		r = amd_sched_init(&ring->sched, &amdgpu_sched_ops,
-				   amdgpu_sched_hw_submission, ring->name);
+				   amdgpu_sched_hw_submission,
+				   timeout, ring->name);
 		if (r) {
 			DRM_ERROR("Failed to create scheduler on ring %s.\n",
 				  ring->name);

--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -327,19 +327,49 @@ static void amd_sched_process_job(struct fence *f, struct fence_cb *cb)
 	struct amd_sched_fence *s_fence =
 		container_of(cb, struct amd_sched_fence, cb);
 	struct amd_gpu_scheduler *sched = s_fence->sched;
+	unsigned long flags;

 	atomic_dec(&sched->hw_rq_count);
 	amd_sched_fence_signal(s_fence);
+	if (sched->timeout != MAX_SCHEDULE_TIMEOUT) {
+		cancel_delayed_work_sync(&s_fence->dwork);
+		spin_lock_irqsave(&sched->fence_list_lock, flags);
+		list_del_init(&s_fence->list);
+		spin_unlock_irqrestore(&sched->fence_list_lock, flags);
+	}
 	fence_put(&s_fence->base);
 	wake_up_interruptible(&sched->wake_up_worker);
 }

+static void amd_sched_fence_work_func(struct work_struct *work)
+{
+	struct amd_sched_fence *s_fence =
+		container_of(work, struct amd_sched_fence, dwork.work);
+	struct amd_gpu_scheduler *sched = s_fence->sched;
+	struct amd_sched_fence *entity, *tmp;
+	unsigned long flags;
+
+	DRM_ERROR("[%s] scheduler is timeout!\n", sched->name);
+
+	/* Clean all pending fences */
+	list_for_each_entry_safe(entity, tmp, &sched->fence_list, list) {
+		DRM_ERROR("  fence no %d\n", entity->base.seqno);
+		cancel_delayed_work_sync(&entity->dwork);
+		spin_lock_irqsave(&sched->fence_list_lock, flags);
+		list_del_init(&entity->list);
+		spin_unlock_irqrestore(&sched->fence_list_lock, flags);
+		fence_put(&entity->base);
+	}
+}
+
 static int amd_sched_main(void *param)
 {
 	struct sched_param sparam = {.sched_priority = 1};
 	struct amd_gpu_scheduler *sched = (struct amd_gpu_scheduler *)param;
 	int r, count;

+	spin_lock_init(&sched->fence_list_lock);
+	INIT_LIST_HEAD(&sched->fence_list);
 	sched_setscheduler(current, SCHED_FIFO, &sparam);

 	while (!kthread_should_stop()) {
@@ -347,6 +377,7 @@ static int amd_sched_main(void *param)
 		struct amd_sched_fence *s_fence;
 		struct amd_sched_job *sched_job;
 		struct fence *fence;
+		unsigned long flags;

 		wait_event_interruptible(sched->wake_up_worker,
 			kthread_should_stop() ||
@@ -357,6 +388,15 @@ static int amd_sched_main(void *param)

 		entity = sched_job->s_entity;
 		s_fence = sched_job->s_fence;
+
+		if (sched->timeout != MAX_SCHEDULE_TIMEOUT) {
+			INIT_DELAYED_WORK(&s_fence->dwork, amd_sched_fence_work_func);
+			schedule_delayed_work(&s_fence->dwork, sched->timeout);
+			spin_lock_irqsave(&sched->fence_list_lock, flags);
+			list_add_tail(&s_fence->list, &sched->fence_list);
+			spin_unlock_irqrestore(&sched->fence_list_lock, flags);
+		}
+
 		atomic_inc(&sched->hw_rq_count);
 		fence = sched->ops->run_job(sched_job);
 		if (fence) {
@@ -392,11 +432,12 @@ static int amd_sched_main(void *param)
 */
 int amd_sched_init(struct amd_gpu_scheduler *sched,
 		   struct amd_sched_backend_ops *ops,
-		   unsigned hw_submission, const char *name)
+		   unsigned hw_submission, long timeout, const char *name)
 {
 	sched->ops = ops;
 	sched->hw_submission_limit = hw_submission;
 	sched->name = name;
+	sched->timeout = timeout;
 	amd_sched_rq_init(&sched->sched_rq);
 	amd_sched_rq_init(&sched->kernel_rq);


--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -68,6 +68,8 @@ struct amd_sched_fence {
 	struct amd_gpu_scheduler	*sched;
 	spinlock_t			lock;
 	void                            *owner;
+	struct delayed_work		dwork;
+	struct list_head		list;
 };

 struct amd_sched_job {
@@ -103,18 +105,21 @@ struct amd_sched_backend_ops {
 struct amd_gpu_scheduler {
 	struct amd_sched_backend_ops	*ops;
 	uint32_t			hw_submission_limit;
+	long				timeout;
 	const char			*name;
 	struct amd_sched_rq		sched_rq;
 	struct amd_sched_rq		kernel_rq;
 	wait_queue_head_t		wake_up_worker;
 	wait_queue_head_t		job_scheduled;
 	atomic_t			hw_rq_count;
+	struct list_head		fence_list;
+	spinlock_t			fence_list_lock;
 	struct task_struct		*thread;
 };

 int amd_sched_init(struct amd_gpu_scheduler *sched,
 		   struct amd_sched_backend_ops *ops,
-		   uint32_t hw_submission, const char *name);
+		   uint32_t hw_submission, long timeout, const char *name);
 void amd_sched_fini(struct amd_gpu_scheduler *sched);

 int amd_sched_entity_init(struct amd_gpu_scheduler *sched,