Commit 8af13c3f authored by Matthew Brost

drm/xe: Store process name and pid in xe file

An xe file can outlive the associated process as the GPU cleanup is just
triggered upon file close (process kill) and completes sometime later.
If the file close triggers error conditions (GPU hangs) the process
cannot be safely referenced to retrieve the name and pid for debug
information. Store the process name and pid directly in the xe file to
be safe.

v2:
 - Access file->pid via rcu_access_pointer (Matthew Auld)

Fixes: b10d0c5e ("drm/xe: Add process name to devcoredump")
Fixes: f6ca930d ("drm/xe: Add process name and PID to job timedout message")
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240723151045.1725417-1-matthew.brost@intel.com
parent c8a31ff6
...@@ -171,7 +171,6 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump, ...@@ -171,7 +171,6 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
u32 adj_logical_mask = q->logical_mask; u32 adj_logical_mask = q->logical_mask;
u32 width_mask = (0x1 << q->width) - 1; u32 width_mask = (0x1 << q->width) - 1;
const char *process_name = "no process"; const char *process_name = "no process";
struct task_struct *task = NULL;
int i; int i;
bool cookie; bool cookie;
...@@ -179,14 +178,9 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump, ...@@ -179,14 +178,9 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
ss->snapshot_time = ktime_get_real(); ss->snapshot_time = ktime_get_real();
ss->boot_time = ktime_get_boottime(); ss->boot_time = ktime_get_boottime();
if (q->vm && q->vm->xef) { if (q->vm && q->vm->xef)
task = get_pid_task(q->vm->xef->drm->pid, PIDTYPE_PID); process_name = q->vm->xef->process_name;
if (task)
process_name = task->comm;
}
strscpy(ss->process_name, process_name); strscpy(ss->process_name, process_name);
if (task)
put_task_struct(task);
ss->gt = q->gt; ss->gt = q->gt;
INIT_WORK(&ss->work, xe_devcoredump_deferred_snap_work); INIT_WORK(&ss->work, xe_devcoredump_deferred_snap_work);
......
...@@ -64,6 +64,7 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file) ...@@ -64,6 +64,7 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
struct xe_drm_client *client; struct xe_drm_client *client;
struct xe_file *xef; struct xe_file *xef;
int ret = -ENOMEM; int ret = -ENOMEM;
struct task_struct *task = NULL;
xef = kzalloc(sizeof(*xef), GFP_KERNEL); xef = kzalloc(sizeof(*xef), GFP_KERNEL);
if (!xef) if (!xef)
...@@ -92,6 +93,13 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file) ...@@ -92,6 +93,13 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
file->driver_priv = xef; file->driver_priv = xef;
kref_init(&xef->refcount); kref_init(&xef->refcount);
task = get_pid_task(rcu_access_pointer(file->pid), PIDTYPE_PID);
if (task) {
xef->process_name = kstrdup(task->comm, GFP_KERNEL);
xef->pid = task->pid;
put_task_struct(task);
}
return 0; return 0;
} }
...@@ -110,6 +118,7 @@ static void xe_file_destroy(struct kref *ref) ...@@ -110,6 +118,7 @@ static void xe_file_destroy(struct kref *ref)
spin_unlock(&xe->clients.lock); spin_unlock(&xe->clients.lock);
xe_drm_client_put(xef->client); xe_drm_client_put(xef->client);
kfree(xef->process_name);
kfree(xef); kfree(xef);
} }
......
...@@ -582,6 +582,18 @@ struct xe_file { ...@@ -582,6 +582,18 @@ struct xe_file {
/** @client: drm client */ /** @client: drm client */
struct xe_drm_client *client; struct xe_drm_client *client;
/**
* @process_name: process name for file handle, used to safely output
* during error situations where xe file can outlive process
*/
char *process_name;
/**
* @pid: pid for file handle, used to safely output during error
* situations where xe file can outlive process
*/
pid_t pid;
/** @refcount: ref count of this xe file */ /** @refcount: ref count of this xe file */
struct kref refcount; struct kref refcount;
}; };
......
...@@ -1072,7 +1072,6 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) ...@@ -1072,7 +1072,6 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
struct xe_gpu_scheduler *sched = &q->guc->sched; struct xe_gpu_scheduler *sched = &q->guc->sched;
struct xe_guc *guc = exec_queue_to_guc(q); struct xe_guc *guc = exec_queue_to_guc(q);
const char *process_name = "no process"; const char *process_name = "no process";
struct task_struct *task = NULL;
int err = -ETIME; int err = -ETIME;
pid_t pid = -1; pid_t pid = -1;
int i = 0; int i = 0;
...@@ -1172,17 +1171,12 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) ...@@ -1172,17 +1171,12 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
} }
if (q->vm && q->vm->xef) { if (q->vm && q->vm->xef) {
task = get_pid_task(q->vm->xef->drm->pid, PIDTYPE_PID); process_name = q->vm->xef->process_name;
if (task) { pid = q->vm->xef->pid;
process_name = task->comm;
pid = task->pid;
}
} }
xe_gt_notice(guc_to_gt(guc), "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx in %s [%d]", xe_gt_notice(guc_to_gt(guc), "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx in %s [%d]",
xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job), xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
q->guc->id, q->flags, process_name, pid); q->guc->id, q->flags, process_name, pid);
if (task)
put_task_struct(task);
trace_xe_sched_job_timedout(job); trace_xe_sched_job_timedout(job);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment