Commit b6723c8d authored by Monk Liu, committed by Alex Deucher

drm/amdgpu: use ref to keep job alive

This fixes a fatal page fault that occurred when a job was
signaled/released after its timeout work had already been queued
to the global workqueue (in that case cancel_delayed_work()
returns false), which led to an NX-protection page fault in
job_timeout_func.
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Chunming Zhou <david1.zhou@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 0de2479c
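
To illustrate the lifetime rule this commit introduces, here is a minimal,
self-contained sketch of the pattern (the my_job_* names are illustrative
only, not part of the amdgpu code): whoever schedules the timeout work takes
a reference on the job, and that reference is dropped either by the cancel
path (when cancel_delayed_work() returns true, i.e. the work never ran) or
by the timeout handler itself, so the handler can never touch freed memory.

    #include <linux/kref.h>
    #include <linux/slab.h>
    #include <linux/workqueue.h>

    struct my_job {
    	struct kref refcount;
    	struct delayed_work work_tdr;
    };

    static struct my_job *my_job_alloc(void)
    {
    	struct my_job *job = kzalloc(sizeof(*job), GFP_KERNEL);

    	if (job)
    		kref_init(&job->refcount);	/* caller owns the first reference */
    	return job;
    }

    static void my_job_free(struct kref *refcount)
    {
    	kfree(container_of(refcount, struct my_job, refcount));
    }

    static void my_job_timeout(struct work_struct *work)
    {
    	struct my_job *job = container_of(work, struct my_job, work_tdr.work);

    	/* ... handle the hang/timeout here ... */

    	/* drop the reference taken when the work was scheduled */
    	kref_put(&job->refcount, my_job_free);
    }

    static void my_job_start_timer(struct my_job *job, unsigned long timeout)
    {
    	INIT_DELAYED_WORK(&job->work_tdr, my_job_timeout);
    	kref_get(&job->refcount);	/* the pending work now owns a reference */
    	schedule_delayed_work(&job->work_tdr, timeout);
    }

    static void my_job_signal(struct my_job *job)
    {
    	/*
    	 * If the work was still pending we own its reference and must drop
    	 * it; if cancel_delayed_work() returns false the handler is either
    	 * running or has already run, and it drops its own reference, so
    	 * the job stays alive for as long as the handler needs it.
    	 */
    	if (cancel_delayed_work(&job->work_tdr))
    		kref_put(&job->refcount, my_job_free);
    }
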
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -750,7 +750,9 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
 		     struct amdgpu_job **job);
 int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
 			     struct amdgpu_job **job);
 void amdgpu_job_free(struct amdgpu_job *job);
+void amdgpu_job_free_func(struct kref *refcount);
 int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring,
 		      struct amd_sched_entity *entity, void *owner,
 		      struct fence **f);
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -872,6 +872,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 	r = amd_sched_job_init(&job->base, &ring->sched,
 			       &p->ctx->rings[ring->idx].entity,
 			       amdgpu_job_timeout_func,
+			       amdgpu_job_free_func,
 			       p->filp, &fence);
 	if (r) {
 		amdgpu_job_free(job);
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -31,7 +31,7 @@
 static void amdgpu_job_free_handler(struct work_struct *ws)
 {
 	struct amdgpu_job *job = container_of(ws, struct amdgpu_job, base.work_free_job);
-	kfree(job);
+	amd_sched_job_put(&job->base);
 }
 
 void amdgpu_job_timeout_func(struct work_struct *work)
@@ -41,6 +41,8 @@ void amdgpu_job_timeout_func(struct work_struct *work)
 		  job->base.sched->name,
 		  (uint32_t)atomic_read(&job->ring->fence_drv.last_seq),
 		  job->ring->fence_drv.sync_seq);
+
+	amd_sched_job_put(&job->base);
 }
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
@@ -101,6 +103,12 @@ void amdgpu_job_free(struct amdgpu_job *job)
 	kfree(job);
 }
 
+void amdgpu_job_free_func(struct kref *refcount)
+{
+	struct amdgpu_job *job = container_of(refcount, struct amdgpu_job, base.refcount);
+	kfree(job);
+}
+
 int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring,
 		      struct amd_sched_entity *entity, void *owner,
 		      struct fence **f)
@@ -113,9 +121,10 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring,
 		return -EINVAL;
 
 	r = amd_sched_job_init(&job->base, &ring->sched,
-			       entity, owner,
+			       entity,
 			       amdgpu_job_timeout_func,
-			       &fence);
+			       amdgpu_job_free_func,
+			       owner, &fence);
 	if (r)
 		return r;
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -333,7 +333,8 @@ void amd_sched_job_finish(struct amd_sched_job *s_job)
 	struct amd_gpu_scheduler *sched = s_job->sched;
 
 	if (sched->timeout != MAX_SCHEDULE_TIMEOUT) {
-		cancel_delayed_work(&s_job->work_tdr); /*TODO: how to deal the case that tdr is running */
+		if (cancel_delayed_work(&s_job->work_tdr))
+			amd_sched_job_put(s_job);
 
 		/* queue TDR for next job */
 		next = list_first_entry_or_null(&sched->ring_mirror_list,
@@ -341,6 +342,7 @@ void amd_sched_job_finish(struct amd_sched_job *s_job)
 		if (next) {
 			INIT_DELAYED_WORK(&next->work_tdr, s_job->timeout_callback);
+			amd_sched_job_get(next);
 			schedule_delayed_work(&next->work_tdr, sched->timeout);
 		}
 	}
 }
@@ -354,6 +356,7 @@ void amd_sched_job_begin(struct amd_sched_job *s_job)
 	    list_first_entry_or_null(&sched->ring_mirror_list, struct amd_sched_job, node) == s_job)
 	{
 		INIT_DELAYED_WORK(&s_job->work_tdr, s_job->timeout_callback);
+		amd_sched_job_get(s_job);
 		schedule_delayed_work(&s_job->work_tdr, sched->timeout);
 	}
 }
@@ -382,9 +385,11 @@ int amd_sched_job_init(struct amd_sched_job *job,
 		       struct amd_gpu_scheduler *sched,
 		       struct amd_sched_entity *entity,
 		       void (*timeout_cb)(struct work_struct *work),
+		       void (*free_cb)(struct kref *refcount),
 		       void *owner, struct fence **fence)
 {
 	INIT_LIST_HEAD(&job->node);
+	kref_init(&job->refcount);
 	job->sched = sched;
 	job->s_entity = entity;
 	job->s_fence = amd_sched_fence_create(entity, owner);
@@ -393,6 +398,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
 	job->s_fence->s_job = job;
 	job->timeout_callback = timeout_cb;
+	job->free_callback = free_cb;
 
 	if (fence)
 		*fence = &job->s_fence->base;
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -78,6 +78,7 @@ struct amd_sched_fence {
 };
 
 struct amd_sched_job {
+	struct kref			refcount;
 	struct amd_gpu_scheduler	*sched;
 	struct amd_sched_entity		*s_entity;
 	struct amd_sched_fence		*s_fence;
@@ -87,6 +88,7 @@ struct amd_sched_job {
 	struct list_head		node;
 	struct delayed_work		work_tdr;
 	void (*timeout_callback) (struct work_struct *work);
+	void (*free_callback)(struct kref *refcount);
 };
 
 extern const struct fence_ops amd_sched_fence_ops;
@@ -155,9 +157,20 @@ int amd_sched_job_init(struct amd_sched_job *job,
 		       struct amd_gpu_scheduler *sched,
 		       struct amd_sched_entity *entity,
 		       void (*timeout_cb)(struct work_struct *work),
+		       void (*free_cb)(struct kref* refcount),
 		       void *owner, struct fence **fence);
 void amd_sched_job_pre_schedule(struct amd_gpu_scheduler *sched ,
 				struct amd_sched_job *s_job);
 void amd_sched_job_finish(struct amd_sched_job *s_job);
 void amd_sched_job_begin(struct amd_sched_job *s_job);
+
+static inline void amd_sched_job_get(struct amd_sched_job *job) {
+	if (job)
+		kref_get(&job->refcount);
+}
+
+static inline void amd_sched_job_put(struct amd_sched_job *job) {
+	if (job)
+		kref_put(&job->refcount, job->free_callback);
+}
 #endif