Commit b8f67b9d authored by Shashank Sharma's avatar Shashank Sharma Committed by Alex Deucher

drm/amdgpu: change vm->task_info handling

This patch changes the handling and lifecycle of vm->task_info object.
The major changes are:
- vm->task_info is a dynamically allocated ptr now, and its usage is
  reference counted.
- introducing two new helper funcs for task_info lifecycle management
    - amdgpu_vm_get_task_info: reference counts up task_info before
      returning this info
    - amdgpu_vm_put_task_info: reference counts down task_info
- last put to task_info() frees task_info from the vm.

This patch also does logistical changes required for existing usage
of vm->task_info.

V2: Do not block all the prints when task_info not found (Felix)

V3: Fixed review comments from Felix
   - Fix wrong indentation
   - No debug message for -ENOMEM
   - Add NULL check for task_info
   - Do not duplicate the debug messages (ti vs no ti)
   - Get first reference of task_info in vm_init(), put last
     in vm_fini()

V4: Fixed review comments from Felix
   - fix double reference increment in create_task_info
   - change amdgpu_vm_get_task_info_pasid
   - additional changes in amdgpu_gem.c while porting

Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 68e05b93
...@@ -1782,9 +1782,14 @@ static int amdgpu_debugfs_vm_info_show(struct seq_file *m, void *unused) ...@@ -1782,9 +1782,14 @@ static int amdgpu_debugfs_vm_info_show(struct seq_file *m, void *unused)
list_for_each_entry(file, &dev->filelist, lhead) { list_for_each_entry(file, &dev->filelist, lhead) {
struct amdgpu_fpriv *fpriv = file->driver_priv; struct amdgpu_fpriv *fpriv = file->driver_priv;
struct amdgpu_vm *vm = &fpriv->vm; struct amdgpu_vm *vm = &fpriv->vm;
struct amdgpu_task_info *ti;
ti = amdgpu_vm_get_task_info_vm(vm);
if (ti) {
seq_printf(m, "pid:%d\tProcess:%s ----------\n", ti->pid, ti->process_name);
amdgpu_vm_put_task_info(ti);
}
seq_printf(m, "pid:%d\tProcess:%s ----------\n",
vm->task_info.pid, vm->task_info.process_name);
r = amdgpu_bo_reserve(vm->root.bo, true); r = amdgpu_bo_reserve(vm->root.bo, true);
if (r) if (r)
break; break;
......
...@@ -208,9 +208,15 @@ static int amdgpu_gem_object_open(struct drm_gem_object *obj, ...@@ -208,9 +208,15 @@ static int amdgpu_gem_object_open(struct drm_gem_object *obj,
if (!WARN_ON(!vm->process_info->eviction_fence)) { if (!WARN_ON(!vm->process_info->eviction_fence)) {
r = amdgpu_amdkfd_bo_validate_and_fence(abo, AMDGPU_GEM_DOMAIN_GTT, r = amdgpu_amdkfd_bo_validate_and_fence(abo, AMDGPU_GEM_DOMAIN_GTT,
&vm->process_info->eviction_fence->base); &vm->process_info->eviction_fence->base);
if (r) if (r) {
dev_warn(adev->dev, "%d: validate_and_fence failed: %d\n", struct amdgpu_task_info *ti = amdgpu_vm_get_task_info_vm(vm);
vm->task_info.pid, r);
dev_warn(adev->dev, "validate_and_fence failed: %d\n", r);
if (ti) {
dev_warn(adev->dev, "pid %d\n", ti->pid);
amdgpu_vm_put_task_info(ti);
}
}
} }
mutex_unlock(&vm->process_info->lock); mutex_unlock(&vm->process_info->lock);
......
...@@ -35,7 +35,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) ...@@ -35,7 +35,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{ {
struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched); struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
struct amdgpu_job *job = to_amdgpu_job(s_job); struct amdgpu_job *job = to_amdgpu_job(s_job);
struct amdgpu_task_info ti; struct amdgpu_task_info *ti;
struct amdgpu_device *adev = ring->adev; struct amdgpu_device *adev = ring->adev;
int idx; int idx;
int r; int r;
...@@ -48,7 +48,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) ...@@ -48,7 +48,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
return DRM_GPU_SCHED_STAT_ENODEV; return DRM_GPU_SCHED_STAT_ENODEV;
} }
memset(&ti, 0, sizeof(struct amdgpu_task_info));
adev->job_hang = true; adev->job_hang = true;
if (amdgpu_gpu_recovery && if (amdgpu_gpu_recovery &&
...@@ -58,12 +58,16 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) ...@@ -58,12 +58,16 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
goto exit; goto exit;
} }
amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n", DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
job->base.sched->name, atomic_read(&ring->fence_drv.last_seq), job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
ring->fence_drv.sync_seq); ring->fence_drv.sync_seq);
DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
ti.process_name, ti.tgid, ti.task_name, ti.pid); ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
if (ti) {
DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
ti->process_name, ti->tgid, ti->task_name, ti->pid);
amdgpu_vm_put_task_info(ti);
}
dma_fence_set_error(&s_job->s_fence->finished, -ETIME); dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
......
...@@ -230,8 +230,16 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, ...@@ -230,8 +230,16 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
coredump->reset_vram_lost = vram_lost; coredump->reset_vram_lost = vram_lost;
if (reset_context->job && reset_context->job->vm) if (reset_context->job && reset_context->job->vm) {
coredump->reset_task_info = reset_context->job->vm->task_info; struct amdgpu_task_info *ti;
struct amdgpu_vm *vm = reset_context->job->vm;
ti = amdgpu_vm_get_task_info_vm(vm);
if (ti) {
coredump->reset_task_info = *ti;
amdgpu_vm_put_task_info(ti);
}
}
coredump->adev = adev; coredump->adev = adev;
......
...@@ -513,8 +513,14 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, ...@@ -513,8 +513,14 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
bo = bo_base->bo; bo = bo_base->bo;
if (dma_resv_locking_ctx(bo->tbo.base.resv) != ticket) { if (dma_resv_locking_ctx(bo->tbo.base.resv) != ticket) {
pr_warn_ratelimited("Evicted user BO is not reserved in pid %d\n", struct amdgpu_task_info *ti = amdgpu_vm_get_task_info_vm(vm);
vm->task_info.pid);
pr_warn_ratelimited("Evicted user BO is not reserved\n");
if (ti) {
pr_warn_ratelimited("pid %d\n", ti->pid);
amdgpu_vm_put_task_info(ti);
}
return -EINVAL; return -EINVAL;
} }
...@@ -2221,6 +2227,108 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout) ...@@ -2221,6 +2227,108 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout)
return dma_fence_wait_timeout(vm->last_unlocked, true, timeout); return dma_fence_wait_timeout(vm->last_unlocked, true, timeout);
} }
/* kref release callback: runs when the last task_info reference is
 * dropped and frees the dynamically allocated struct. */
static void amdgpu_vm_destroy_task_info(struct kref *kref)
{
	struct amdgpu_task_info *task_info;

	task_info = container_of(kref, struct amdgpu_task_info, refcount);
	kfree(task_info);
}
/* Look up the VM registered for @pasid in the pasids xarray, or NULL.
 * The xarray's internal spinlock is taken with IRQs saved because the
 * lookup can be reached from interrupt handlers. */
static inline struct amdgpu_vm *
amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid)
{
	unsigned long irqflags;
	struct amdgpu_vm *found;

	xa_lock_irqsave(&adev->vm_manager.pasids, irqflags);
	found = xa_load(&adev->vm_manager.pasids, pasid);
	xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);

	return found;
}
/**
 * amdgpu_vm_put_task_info - reference down the vm task_info ptr
 *
 * @task_info: task_info struct under discussion. May be NULL, in which
 *             case this is a no-op (mirrors kfree(NULL) semantics).
 *
 * frees the vm task_info ptr at the last put
 */
void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info)
{
	/* Tolerate NULL so callers can unconditionally put the result of
	 * a failed amdgpu_vm_get_task_info_*() without crashing in
	 * kref_put() on a NULL pointer. */
	if (!task_info)
		return;

	kref_put(&task_info->refcount, amdgpu_vm_destroy_task_info);
}
/**
* amdgpu_vm_get_task_info_vm - Extracts task info for a vm.
*
* @vm: VM to get info from
*
* Returns the reference counted task_info structure, which must be
* referenced down with amdgpu_vm_put_task_info.
*/
struct amdgpu_task_info *
amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm)
{
struct amdgpu_task_info *ti = NULL;
if (vm) {
ti = vm->task_info;
kref_get(&vm->task_info->refcount);
}
return ti;
}
/**
 * amdgpu_vm_get_task_info_pasid - Extracts task info for a PASID.
 *
 * @adev: drm device pointer
 * @pasid: PASID identifier for VM
 *
 * Returns the reference counted task_info structure, which must be
 * referenced down with amdgpu_vm_put_task_info, or NULL if no VM is
 * registered for @pasid or its task_info was never allocated.
 */
struct amdgpu_task_info *
amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid)
{
	struct amdgpu_task_info *ti = NULL;
	struct amdgpu_vm *vm;
	unsigned long flags;

	/* Take the task_info reference while the pasids xarray lock is
	 * still held: once the lock is dropped the VM may be torn down
	 * concurrently, so dereferencing the looked-up vm outside the
	 * lock (as a plain xa_load + later kref_get would) races with
	 * VM destruction. */
	xa_lock_irqsave(&adev->vm_manager.pasids, flags);
	vm = xa_load(&adev->vm_manager.pasids, pasid);
	if (vm && vm->task_info) {
		ti = vm->task_info;
		kref_get(&ti->refcount);
	}
	xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);

	return ti;
}
/* Allocate the VM's task_info and take the first (vm-owned) reference;
 * that reference is dropped in amdgpu_vm_fini(). Returns -ENOMEM on
 * allocation failure, 0 on success. */
static int amdgpu_vm_create_task_info(struct amdgpu_vm *vm)
{
	/* sizeof on the dereferenced pointer keeps the allocation size
	 * tied to the field's type. */
	vm->task_info = kzalloc(sizeof(*vm->task_info), GFP_KERNEL);
	if (!vm->task_info)
		return -ENOMEM;

	kref_init(&vm->task_info->refcount);

	return 0;
}
/**
* amdgpu_vm_set_task_info - Sets VMs task info.
*
* @vm: vm for which to set the info
*/
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
{
if (!vm->task_info)
return;
if (vm->task_info->pid == current->pid)
return;
vm->task_info->pid = current->pid;
get_task_comm(vm->task_info->task_name, current);
if (current->group_leader->mm != current->mm)
return;
vm->task_info->tgid = current->group_leader->pid;
get_task_comm(vm->task_info->process_name, current->group_leader);
}
/** /**
* amdgpu_vm_init - initialize a vm instance * amdgpu_vm_init - initialize a vm instance
* *
...@@ -2306,6 +2414,10 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, ...@@ -2306,6 +2414,10 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
if (r) if (r)
goto error_free_root; goto error_free_root;
r = amdgpu_vm_create_task_info(vm);
if (r)
DRM_DEBUG("Failed to create task info for VM\n");
amdgpu_bo_unreserve(vm->root.bo); amdgpu_bo_unreserve(vm->root.bo);
amdgpu_bo_unref(&root_bo); amdgpu_bo_unref(&root_bo);
...@@ -2427,6 +2539,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) ...@@ -2427,6 +2539,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
root = amdgpu_bo_ref(vm->root.bo); root = amdgpu_bo_ref(vm->root.bo);
amdgpu_bo_reserve(root, true); amdgpu_bo_reserve(root, true);
amdgpu_vm_put_task_info(vm->task_info);
amdgpu_vm_set_pasid(adev, vm, 0); amdgpu_vm_set_pasid(adev, vm, 0);
dma_fence_wait(vm->last_unlocked, false); dma_fence_wait(vm->last_unlocked, false);
dma_fence_put(vm->last_unlocked); dma_fence_put(vm->last_unlocked);
...@@ -2583,48 +2696,6 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) ...@@ -2583,48 +2696,6 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
return 0; return 0;
} }
/**
 * amdgpu_vm_get_task_info - Extracts task info for a PASID.
 *
 * @adev: drm device pointer
 * @pasid: PASID identifier for VM
 * @task_info: task_info to fill.
 *
 * Legacy copy-out API removed by this commit: the caller-provided struct
 * is left untouched when no VM is registered for @pasid.
 */
void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
struct amdgpu_task_info *task_info)
{
struct amdgpu_vm *vm;
unsigned long flags;
/* Copy by value under the pasids xarray lock so the VM (and its
 * embedded task_info) cannot be torn down mid-copy. */
xa_lock_irqsave(&adev->vm_manager.pasids, flags);
vm = xa_load(&adev->vm_manager.pasids, pasid);
if (vm)
*task_info = vm->task_info;
xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
}
/**
 * amdgpu_vm_set_task_info - Sets VMs task info.
 *
 * @vm: vm for which to set the info
 *
 * Pre-refactor version: task_info is embedded in the VM and set only
 * once (first caller wins), unlike the replacement which re-records on
 * a pid change.
 */
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
{
/* Set-once: a non-zero pid means the info was already captured. */
if (vm->task_info.pid)
return;
vm->task_info.pid = current->pid;
get_task_comm(vm->task_info.task_name, current);
/* Skip process-wide fields when the address space differs from the
 * group leader's. */
if (current->group_leader->mm != current->mm)
return;
vm->task_info.tgid = current->group_leader->pid;
get_task_comm(vm->task_info.process_name, current->group_leader);
}
/** /**
* amdgpu_vm_handle_fault - graceful handling of VM faults. * amdgpu_vm_handle_fault - graceful handling of VM faults.
* @adev: amdgpu device pointer * @adev: amdgpu device pointer
......
...@@ -203,10 +203,11 @@ struct amdgpu_vm_pte_funcs { ...@@ -203,10 +203,11 @@ struct amdgpu_vm_pte_funcs {
}; };
struct amdgpu_task_info { struct amdgpu_task_info {
char process_name[TASK_COMM_LEN]; char process_name[TASK_COMM_LEN];
char task_name[TASK_COMM_LEN]; char task_name[TASK_COMM_LEN];
pid_t pid; pid_t pid;
pid_t tgid; pid_t tgid;
struct kref refcount;
}; };
/** /**
...@@ -370,7 +371,7 @@ struct amdgpu_vm { ...@@ -370,7 +371,7 @@ struct amdgpu_vm {
uint64_t pd_phys_addr; uint64_t pd_phys_addr;
/* Some basic info about the task */ /* Some basic info about the task */
struct amdgpu_task_info task_info; struct amdgpu_task_info *task_info;
/* Store positions of group of BOs */ /* Store positions of group of BOs */
struct ttm_lru_bulk_move lru_bulk_move; struct ttm_lru_bulk_move lru_bulk_move;
...@@ -511,8 +512,14 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring, ...@@ -511,8 +512,14 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
struct amdgpu_job *job); struct amdgpu_job *job);
void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev); void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid, struct amdgpu_task_info *
struct amdgpu_task_info *task_info); amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid);
struct amdgpu_task_info *
amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
u32 vmid, u32 node_id, uint64_t addr, u32 vmid, u32 node_id, uint64_t addr,
bool write_fault); bool write_fault);
......
...@@ -973,7 +973,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params, ...@@ -973,7 +973,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
trace_amdgpu_vm_update_ptes(params, frag_start, upd_end, trace_amdgpu_vm_update_ptes(params, frag_start, upd_end,
min(nptes, 32u), dst, incr, min(nptes, 32u), dst, incr,
upd_flags, upd_flags,
vm->task_info.tgid, vm->task_info ? vm->task_info->tgid : 0,
vm->immediate.fence_context); vm->immediate.fence_context);
amdgpu_vm_pte_update_flags(params, to_amdgpu_bo_vm(pt), amdgpu_vm_pte_update_flags(params, to_amdgpu_bo_vm(pt),
cursor.level, pe_start, dst, cursor.level, pe_start, dst,
......
...@@ -105,7 +105,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, ...@@ -105,7 +105,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
struct amdgpu_vmhub *hub = &adev->vmhub[vmhub_index]; struct amdgpu_vmhub *hub = &adev->vmhub[vmhub_index];
bool retry_fault = !!(entry->src_data[1] & 0x80); bool retry_fault = !!(entry->src_data[1] & 0x80);
bool write_fault = !!(entry->src_data[1] & 0x20); bool write_fault = !!(entry->src_data[1] & 0x20);
struct amdgpu_task_info task_info; struct amdgpu_task_info *task_info;
uint32_t status = 0; uint32_t status = 0;
u64 addr; u64 addr;
...@@ -157,18 +157,22 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, ...@@ -157,18 +157,22 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
if (!printk_ratelimit()) if (!printk_ratelimit())
return 0; return 0;
memset(&task_info, 0, sizeof(struct amdgpu_task_info));
amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
dev_err(adev->dev, dev_err(adev->dev,
"[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n", "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
entry->vmid_src ? "mmhub" : "gfxhub", entry->vmid_src ? "mmhub" : "gfxhub",
entry->src_id, entry->ring_id, entry->vmid, entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
entry->pasid, task_info.process_name, task_info.tgid, task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
task_info.task_name, task_info.pid); if (task_info) {
dev_err(adev->dev,
" in process %s pid %d thread %s pid %d\n",
task_info->process_name, task_info->tgid,
task_info->task_name, task_info->pid);
amdgpu_vm_put_task_info(task_info);
}
dev_err(adev->dev, " in page starting at address 0x%016llx from client 0x%x (%s)\n", dev_err(adev->dev, " in page starting at address 0x%016llx from client 0x%x (%s)\n",
addr, entry->client_id, addr, entry->client_id,
soc15_ih_clientid_name[entry->client_id]); soc15_ih_clientid_name[entry->client_id]);
if (!amdgpu_sriov_vf(adev)) if (!amdgpu_sriov_vf(adev))
hub->vmhub_funcs->print_l2_protection_fault_status(adev, hub->vmhub_funcs->print_l2_protection_fault_status(adev,
......
...@@ -126,19 +126,24 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev, ...@@ -126,19 +126,24 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev,
} }
if (printk_ratelimit()) { if (printk_ratelimit()) {
struct amdgpu_task_info task_info; struct amdgpu_task_info *task_info;
memset(&task_info, 0, sizeof(struct amdgpu_task_info));
amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
dev_err(adev->dev, dev_err(adev->dev,
"[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n", "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
entry->vmid_src ? "mmhub" : "gfxhub", entry->vmid_src ? "mmhub" : "gfxhub",
entry->src_id, entry->ring_id, entry->vmid, entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
entry->pasid, task_info.process_name, task_info.tgid, task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
task_info.task_name, task_info.pid); if (task_info) {
dev_err(adev->dev,
" in process %s pid %d thread %s pid %d)\n",
task_info->process_name, task_info->tgid,
task_info->task_name, task_info->pid);
amdgpu_vm_put_task_info(task_info);
}
dev_err(adev->dev, " in page starting at address 0x%016llx from client %d\n", dev_err(adev->dev, " in page starting at address 0x%016llx from client %d\n",
addr, entry->client_id); addr, entry->client_id);
if (!amdgpu_sriov_vf(adev)) if (!amdgpu_sriov_vf(adev))
hub->vmhub_funcs->print_l2_protection_fault_status(adev, status); hub->vmhub_funcs->print_l2_protection_fault_status(adev, status);
} }
......
...@@ -1445,18 +1445,24 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev, ...@@ -1445,18 +1445,24 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
gmc_v8_0_set_fault_enable_default(adev, false); gmc_v8_0_set_fault_enable_default(adev, false);
if (printk_ratelimit()) { if (printk_ratelimit()) {
struct amdgpu_task_info task_info; struct amdgpu_task_info *task_info;
memset(&task_info, 0, sizeof(struct amdgpu_task_info)); dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
amdgpu_vm_get_task_info(adev, entry->pasid, &task_info); entry->src_id, entry->src_data[0]);
task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
if (task_info) {
dev_err(adev->dev, " for process %s pid %d thread %s pid %d\n",
task_info->process_name, task_info->tgid,
task_info->task_name, task_info->pid);
amdgpu_vm_put_task_info(task_info);
}
dev_err(adev->dev, "GPU fault detected: %d 0x%08x for process %s pid %d thread %s pid %d\n",
entry->src_id, entry->src_data[0], task_info.process_name,
task_info.tgid, task_info.task_name, task_info.pid);
dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR 0x%08X\n", dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR 0x%08X\n",
addr); addr);
dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_STATUS 0x%08X\n", dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_STATUS 0x%08X\n",
status); status);
gmc_v8_0_vm_decode_fault(adev, status, addr, mc_client, gmc_v8_0_vm_decode_fault(adev, status, addr, mc_client,
entry->pasid); entry->pasid);
} }
......
...@@ -549,7 +549,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, ...@@ -549,7 +549,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
bool retry_fault = !!(entry->src_data[1] & 0x80); bool retry_fault = !!(entry->src_data[1] & 0x80);
bool write_fault = !!(entry->src_data[1] & 0x20); bool write_fault = !!(entry->src_data[1] & 0x20);
uint32_t status = 0, cid = 0, rw = 0; uint32_t status = 0, cid = 0, rw = 0;
struct amdgpu_task_info task_info; struct amdgpu_task_info *task_info;
struct amdgpu_vmhub *hub; struct amdgpu_vmhub *hub;
const char *mmhub_cid; const char *mmhub_cid;
const char *hub_name; const char *hub_name;
...@@ -626,15 +626,20 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, ...@@ -626,15 +626,20 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
if (!printk_ratelimit()) if (!printk_ratelimit())
return 0; return 0;
memset(&task_info, 0, sizeof(struct amdgpu_task_info));
amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
dev_err(adev->dev, dev_err(adev->dev,
"[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n", "[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n", hub_name,
hub_name, retry_fault ? "retry" : "no-retry", retry_fault ? "retry" : "no-retry",
entry->src_id, entry->ring_id, entry->vmid, entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
entry->pasid, task_info.process_name, task_info.tgid,
task_info.task_name, task_info.pid); task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
if (task_info) {
dev_err(adev->dev,
" for process %s pid %d thread %s pid %d)\n",
task_info->process_name, task_info->tgid,
task_info->task_name, task_info->pid);
amdgpu_vm_put_task_info(task_info);
}
dev_err(adev->dev, " in page starting at address 0x%016llx from IH client 0x%x (%s)\n", dev_err(adev->dev, " in page starting at address 0x%016llx from IH client 0x%x (%s)\n",
addr, entry->client_id, addr, entry->client_id,
soc15_ih_clientid_name[entry->client_id]); soc15_ih_clientid_name[entry->client_id]);
......
...@@ -2104,7 +2104,7 @@ static int sdma_v4_0_print_iv_entry(struct amdgpu_device *adev, ...@@ -2104,7 +2104,7 @@ static int sdma_v4_0_print_iv_entry(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry) struct amdgpu_iv_entry *entry)
{ {
int instance; int instance;
struct amdgpu_task_info task_info; struct amdgpu_task_info *task_info;
u64 addr; u64 addr;
instance = sdma_v4_0_irq_id_to_seq(entry->client_id); instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
...@@ -2116,15 +2116,20 @@ static int sdma_v4_0_print_iv_entry(struct amdgpu_device *adev, ...@@ -2116,15 +2116,20 @@ static int sdma_v4_0_print_iv_entry(struct amdgpu_device *adev,
addr = (u64)entry->src_data[0] << 12; addr = (u64)entry->src_data[0] << 12;
addr |= ((u64)entry->src_data[1] & 0xf) << 44; addr |= ((u64)entry->src_data[1] & 0xf) << 44;
memset(&task_info, 0, sizeof(struct amdgpu_task_info));
amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
dev_dbg_ratelimited(adev->dev, dev_dbg_ratelimited(adev->dev,
"[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u " "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u pasid:%u\n",
"pasid:%u, for process %s pid %d thread %s pid %d\n", instance, addr, entry->src_id, entry->ring_id, entry->vmid,
instance, addr, entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
entry->pasid, task_info.process_name, task_info.tgid,
task_info.task_name, task_info.pid); task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
if (task_info) {
dev_dbg_ratelimited(adev->dev,
" for process %s pid %d thread %s pid %d\n",
task_info->process_name, task_info->tgid,
task_info->task_name, task_info->pid);
amdgpu_vm_put_task_info(task_info);
}
return 0; return 0;
} }
......
...@@ -1644,7 +1644,7 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev, ...@@ -1644,7 +1644,7 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry) struct amdgpu_iv_entry *entry)
{ {
int instance; int instance;
struct amdgpu_task_info task_info; struct amdgpu_task_info *task_info;
u64 addr; u64 addr;
instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id); instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id);
...@@ -1656,15 +1656,19 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev, ...@@ -1656,15 +1656,19 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev,
addr = (u64)entry->src_data[0] << 12; addr = (u64)entry->src_data[0] << 12;
addr |= ((u64)entry->src_data[1] & 0xf) << 44; addr |= ((u64)entry->src_data[1] & 0xf) << 44;
memset(&task_info, 0, sizeof(struct amdgpu_task_info));
amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
dev_dbg_ratelimited(adev->dev, dev_dbg_ratelimited(adev->dev,
"[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u " "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u pasid:%u\n",
"pasid:%u, for process %s pid %d thread %s pid %d\n", instance, addr, entry->src_id, entry->ring_id, entry->vmid,
instance, addr, entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
entry->pasid, task_info.process_name, task_info.tgid,
task_info.task_name, task_info.pid); task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
if (task_info) {
dev_dbg_ratelimited(adev->dev, " for process %s pid %d thread %s pid %d\n",
task_info->process_name, task_info->tgid,
task_info->task_name, task_info->pid);
amdgpu_vm_put_task_info(task_info);
}
return 0; return 0;
} }
......
...@@ -238,16 +238,16 @@ void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, ...@@ -238,16 +238,16 @@ void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
{ {
struct amdgpu_task_info task_info; struct amdgpu_task_info *task_info;
memset(&task_info, 0, sizeof(struct amdgpu_task_info)); task_info = amdgpu_vm_get_task_info_pasid(dev->adev, pasid);
amdgpu_vm_get_task_info(dev->adev, pasid, &task_info); if (task_info) {
/* Report VM faults from user applications, not retry from kernel */ /* Report VM faults from user applications, not retry from kernel */
if (!task_info.pid) if (task_info->pid)
return; kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
task_info->pid, task_info->task_name);
kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n", amdgpu_vm_put_task_info(task_info);
task_info.pid, task_info.task_name); }
} }
void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment