Commit 50661eb1 authored by Felix Kuehling's avatar Felix Kuehling Committed by Alex Deucher

drm/amdgpu: Auto-validate DMABuf imports in compute VMs

DMABuf imports in compute VMs are not wrapped in a kgd_mem object on the
process_info->kfd_bo_list. There is no explicit KFD API call to validate
them or add eviction fences to them.

This patch automatically validates and fences dymanic DMABuf imports when
they are added to a compute VM. Revalidation after evictions is handled
in the VM code.

v2:
* Renamed amdgpu_vm_validate_evicted_bos to amdgpu_vm_validate
* Eliminated evicted_user state, use evicted state for VM BOs and user BOs
* Fixed and simplified amdgpu_vm_fence_imports, depends on reserved BOs
* Moved dma_resv_reserve_fences for amdgpu_vm_fence_imports into
  amdgpu_vm_validate, outside the vm->status_lock
* Added dummy version of amdgpu_amdkfd_bo_validate_and_fence for builds
  without KFD

v4: Eliminate amdgpu_vm_fence_imports. It's not needed because the
reservation with its fences is shared with the export, as long as all
imports are from KFD, with the exports already reserved, validated and
fenced by the KFD restore worker.

v5: Reintroduced separate evicted_user state to simplify the state machine
and CS error handling when amdgpu_vm_validate is called without a ticket.
Signed-off-by: default avatarFelix Kuehling <felix.kuehling@amd.com>
Reviewed-by: default avatarChristian König <christian.koenig@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent d7643fe6
...@@ -191,6 +191,9 @@ struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f); ...@@ -191,6 +191,9 @@ struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo); int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo);
int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni, int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
unsigned long cur_seq, struct kgd_mem *mem); unsigned long cur_seq, struct kgd_mem *mem);
int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
uint32_t domain,
struct dma_fence *fence);
#else #else
static inline static inline
bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm) bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm)
...@@ -216,6 +219,13 @@ int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni, ...@@ -216,6 +219,13 @@ int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
{ {
return 0; return 0;
} }
static inline
int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
uint32_t domain,
struct dma_fence *fence)
{
return 0;
}
#endif #endif
/* Shared API */ /* Shared API */
int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size, int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size,
......
...@@ -426,9 +426,9 @@ static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain, ...@@ -426,9 +426,9 @@ static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
return ret; return ret;
} }
static int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
uint32_t domain, uint32_t domain,
struct dma_fence *fence) struct dma_fence *fence)
{ {
int ret = amdgpu_bo_reserve(bo, false); int ret = amdgpu_bo_reserve(bo, false);
...@@ -464,13 +464,15 @@ static int amdgpu_amdkfd_validate_vm_bo(void *_unused, struct amdgpu_bo *bo) ...@@ -464,13 +464,15 @@ static int amdgpu_amdkfd_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)
* again. Page directories are only updated after updating page * again. Page directories are only updated after updating page
* tables. * tables.
*/ */
static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm) static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm,
struct ww_acquire_ctx *ticket)
{ {
struct amdgpu_bo *pd = vm->root.bo; struct amdgpu_bo *pd = vm->root.bo;
struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev); struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);
int ret; int ret;
ret = amdgpu_vm_validate_pt_bos(adev, vm, amdgpu_amdkfd_validate_vm_bo, NULL); ret = amdgpu_vm_validate(adev, vm, ticket,
amdgpu_amdkfd_validate_vm_bo, NULL);
if (ret) { if (ret) {
pr_err("failed to validate PT BOs\n"); pr_err("failed to validate PT BOs\n");
return ret; return ret;
...@@ -1310,14 +1312,15 @@ static int map_bo_to_gpuvm(struct kgd_mem *mem, ...@@ -1310,14 +1312,15 @@ static int map_bo_to_gpuvm(struct kgd_mem *mem,
return ret; return ret;
} }
static int process_validate_vms(struct amdkfd_process_info *process_info) static int process_validate_vms(struct amdkfd_process_info *process_info,
struct ww_acquire_ctx *ticket)
{ {
struct amdgpu_vm *peer_vm; struct amdgpu_vm *peer_vm;
int ret; int ret;
list_for_each_entry(peer_vm, &process_info->vm_list_head, list_for_each_entry(peer_vm, &process_info->vm_list_head,
vm_list_node) { vm_list_node) {
ret = vm_validate_pt_pd_bos(peer_vm); ret = vm_validate_pt_pd_bos(peer_vm, ticket);
if (ret) if (ret)
return ret; return ret;
} }
...@@ -1402,7 +1405,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info, ...@@ -1402,7 +1405,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
ret = amdgpu_bo_reserve(vm->root.bo, true); ret = amdgpu_bo_reserve(vm->root.bo, true);
if (ret) if (ret)
goto reserve_pd_fail; goto reserve_pd_fail;
ret = vm_validate_pt_pd_bos(vm); ret = vm_validate_pt_pd_bos(vm, NULL);
if (ret) { if (ret) {
pr_err("validate_pt_pd_bos() failed\n"); pr_err("validate_pt_pd_bos() failed\n");
goto validate_pd_fail; goto validate_pd_fail;
...@@ -2043,7 +2046,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( ...@@ -2043,7 +2046,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
bo->tbo.resource->mem_type == TTM_PL_SYSTEM) bo->tbo.resource->mem_type == TTM_PL_SYSTEM)
is_invalid_userptr = true; is_invalid_userptr = true;
ret = vm_validate_pt_pd_bos(avm); ret = vm_validate_pt_pd_bos(avm, NULL);
if (unlikely(ret)) if (unlikely(ret))
goto out_unreserve; goto out_unreserve;
...@@ -2122,7 +2125,7 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( ...@@ -2122,7 +2125,7 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
goto unreserve_out; goto unreserve_out;
} }
ret = vm_validate_pt_pd_bos(avm); ret = vm_validate_pt_pd_bos(avm, NULL);
if (unlikely(ret)) if (unlikely(ret))
goto unreserve_out; goto unreserve_out;
...@@ -2620,7 +2623,7 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info) ...@@ -2620,7 +2623,7 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
} }
} }
ret = process_validate_vms(process_info); ret = process_validate_vms(process_info, NULL);
if (ret) if (ret)
goto unreserve_out; goto unreserve_out;
...@@ -2880,11 +2883,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * ...@@ -2880,11 +2883,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *
amdgpu_sync_create(&sync_obj); amdgpu_sync_create(&sync_obj);
/* Validate PDs and PTs */
ret = process_validate_vms(process_info);
if (ret)
goto validate_map_fail;
/* Validate BOs and map them to GPUVM (update VM page tables). */ /* Validate BOs and map them to GPUVM (update VM page tables). */
list_for_each_entry(mem, &process_info->kfd_bo_list, list_for_each_entry(mem, &process_info->kfd_bo_list,
validate_list) { validate_list) {
...@@ -2935,6 +2933,13 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * ...@@ -2935,6 +2933,13 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *
if (failed_size) if (failed_size)
pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size); pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size);
/* Validate PDs, PTs and evicted DMABuf imports last. Otherwise BO
* validations above would invalidate DMABuf imports again.
*/
ret = process_validate_vms(process_info, &exec.ticket);
if (ret)
goto validate_map_fail;
/* Update mappings not managed by KFD */ /* Update mappings not managed by KFD */
list_for_each_entry(peer_vm, &process_info->vm_list_head, list_for_each_entry(peer_vm, &process_info->vm_list_head,
vm_list_node) { vm_list_node) {
...@@ -3006,7 +3011,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * ...@@ -3006,7 +3011,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *
&process_info->eviction_fence->base, &process_info->eviction_fence->base,
DMA_RESV_USAGE_BOOKKEEP); DMA_RESV_USAGE_BOOKKEEP);
} }
/* Attach eviction fence to PD / PT BOs */ /* Attach eviction fence to PD / PT BOs and DMABuf imports */
list_for_each_entry(peer_vm, &process_info->vm_list_head, list_for_each_entry(peer_vm, &process_info->vm_list_head,
vm_list_node) { vm_list_node) {
struct amdgpu_bo *bo = peer_vm->root.bo; struct amdgpu_bo *bo = peer_vm->root.bo;
......
...@@ -952,10 +952,10 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, ...@@ -952,10 +952,10 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
p->bytes_moved = 0; p->bytes_moved = 0;
p->bytes_moved_vis = 0; p->bytes_moved_vis = 0;
r = amdgpu_vm_validate_pt_bos(p->adev, &fpriv->vm, r = amdgpu_vm_validate(p->adev, &fpriv->vm, NULL,
amdgpu_cs_bo_validate, p); amdgpu_cs_bo_validate, p);
if (r) { if (r) {
DRM_ERROR("amdgpu_vm_validate_pt_bos() failed.\n"); DRM_ERROR("amdgpu_vm_validate() failed.\n");
goto out_free_user_pages; goto out_free_user_pages;
} }
......
...@@ -377,6 +377,10 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach) ...@@ -377,6 +377,10 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach)
struct amdgpu_vm_bo_base *bo_base; struct amdgpu_vm_bo_base *bo_base;
int r; int r;
/* FIXME: This should be after the "if", but needs a fix to make sure
* DMABuf imports are initialized in the right VM list.
*/
amdgpu_vm_bo_invalidate(adev, bo, false);
if (!bo->tbo.resource || bo->tbo.resource->mem_type == TTM_PL_SYSTEM) if (!bo->tbo.resource || bo->tbo.resource->mem_type == TTM_PL_SYSTEM)
return; return;
......
...@@ -187,7 +187,34 @@ static int amdgpu_gem_object_open(struct drm_gem_object *obj, ...@@ -187,7 +187,34 @@ static int amdgpu_gem_object_open(struct drm_gem_object *obj,
else else
++bo_va->ref_count; ++bo_va->ref_count;
amdgpu_bo_unreserve(abo); amdgpu_bo_unreserve(abo);
return 0;
/* Validate and add eviction fence to DMABuf imports with dynamic
* attachment in compute VMs. Re-validation will be done by
* amdgpu_vm_validate. Fences are on the reservation shared with the
* export, which is currently required to be validated and fenced
* already by amdgpu_amdkfd_gpuvm_restore_process_bos.
*
* Nested locking below for the case that a GEM object is opened in
* kfd_mem_export_dmabuf. Since the lock below is only taken for imports,
* but not for export, this is a different lock class that cannot lead to
* circular lock dependencies.
*/
if (!vm->is_compute_context || !vm->process_info)
return 0;
if (!obj->import_attach ||
!dma_buf_is_dynamic(obj->import_attach->dmabuf))
return 0;
mutex_lock_nested(&vm->process_info->lock, 1);
if (!WARN_ON(!vm->process_info->eviction_fence)) {
r = amdgpu_amdkfd_bo_validate_and_fence(abo, AMDGPU_GEM_DOMAIN_GTT,
&vm->process_info->eviction_fence->base);
if (r)
dev_warn(adev->dev, "%d: validate_and_fence failed: %d\n",
vm->task_info.pid, r);
}
mutex_unlock(&vm->process_info->lock);
return r;
} }
static void amdgpu_gem_object_close(struct drm_gem_object *obj, static void amdgpu_gem_object_close(struct drm_gem_object *obj,
......
...@@ -233,6 +233,22 @@ static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo) ...@@ -233,6 +233,22 @@ static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo)
spin_unlock(&vm_bo->vm->status_lock); spin_unlock(&vm_bo->vm->status_lock);
} }
/**
* amdgpu_vm_bo_evicted_user - vm_bo is evicted
*
* @vm_bo: vm_bo which is evicted
*
* State for BOs used by user mode queues which are not at the location they
* should be.
*/
static void amdgpu_vm_bo_evicted_user(struct amdgpu_vm_bo_base *vm_bo)
{
vm_bo->moved = true;
spin_lock(&vm_bo->vm->status_lock);
list_move(&vm_bo->vm_status, &vm_bo->vm->evicted_user);
spin_unlock(&vm_bo->vm->status_lock);
}
/** /**
* amdgpu_vm_bo_relocated - vm_bo is reloacted * amdgpu_vm_bo_relocated - vm_bo is reloacted
* *
...@@ -427,21 +443,25 @@ uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm) ...@@ -427,21 +443,25 @@ uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm)
} }
/** /**
* amdgpu_vm_validate_pt_bos - validate the page table BOs * amdgpu_vm_validate - validate evicted BOs tracked in the VM
* *
* @adev: amdgpu device pointer * @adev: amdgpu device pointer
* @vm: vm providing the BOs * @vm: vm providing the BOs
* @ticket: optional reservation ticket used to reserve the VM
* @validate: callback to do the validation * @validate: callback to do the validation
* @param: parameter for the validation callback * @param: parameter for the validation callback
* *
* Validate the page table BOs on command submission if neccessary. * Validate the page table BOs and per-VM BOs on command submission if
* necessary. If a ticket is given, also try to validate evicted user queue
* BOs. They must already be reserved with the given ticket.
* *
* Returns: * Returns:
* Validation result. * Validation result.
*/ */
int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
int (*validate)(void *p, struct amdgpu_bo *bo), struct ww_acquire_ctx *ticket,
void *param) int (*validate)(void *p, struct amdgpu_bo *bo),
void *param)
{ {
struct amdgpu_vm_bo_base *bo_base; struct amdgpu_vm_bo_base *bo_base;
struct amdgpu_bo *shadow; struct amdgpu_bo *shadow;
...@@ -484,6 +504,28 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, ...@@ -484,6 +504,28 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
} }
spin_lock(&vm->status_lock); spin_lock(&vm->status_lock);
} }
while (ticket && !list_empty(&vm->evicted_user)) {
bo_base = list_first_entry(&vm->evicted_user,
struct amdgpu_vm_bo_base,
vm_status);
spin_unlock(&vm->status_lock);
bo = bo_base->bo;
if (dma_resv_locking_ctx(bo->tbo.base.resv) != ticket) {
pr_warn_ratelimited("Evicted user BO is not reserved in pid %d\n",
vm->task_info.pid);
return -EINVAL;
}
r = validate(param, bo);
if (r)
return r;
amdgpu_vm_bo_invalidated(bo_base);
spin_lock(&vm->status_lock);
}
spin_unlock(&vm->status_lock); spin_unlock(&vm->status_lock);
amdgpu_vm_eviction_lock(vm); amdgpu_vm_eviction_lock(vm);
...@@ -1426,11 +1468,21 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev, ...@@ -1426,11 +1468,21 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
} }
r = amdgpu_vm_bo_update(adev, bo_va, clear); r = amdgpu_vm_bo_update(adev, bo_va, clear);
if (r)
return r;
if (unlock) if (unlock)
dma_resv_unlock(resv); dma_resv_unlock(resv);
if (r)
return r;
/* Remember evicted DMABuf imports in compute VMs for later
* validation
*/
if (vm->is_compute_context && bo_va->base.bo &&
bo_va->base.bo->tbo.base.import_attach &&
(!bo_va->base.bo->tbo.resource ||
bo_va->base.bo->tbo.resource->mem_type == TTM_PL_SYSTEM))
amdgpu_vm_bo_evicted_user(&bo_va->base);
spin_lock(&vm->status_lock); spin_lock(&vm->status_lock);
} }
spin_unlock(&vm->status_lock); spin_unlock(&vm->status_lock);
...@@ -2196,6 +2248,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, ...@@ -2196,6 +2248,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
for (i = 0; i < AMDGPU_MAX_VMHUBS; i++) for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
vm->reserved_vmid[i] = NULL; vm->reserved_vmid[i] = NULL;
INIT_LIST_HEAD(&vm->evicted); INIT_LIST_HEAD(&vm->evicted);
INIT_LIST_HEAD(&vm->evicted_user);
INIT_LIST_HEAD(&vm->relocated); INIT_LIST_HEAD(&vm->relocated);
INIT_LIST_HEAD(&vm->moved); INIT_LIST_HEAD(&vm->moved);
INIT_LIST_HEAD(&vm->idle); INIT_LIST_HEAD(&vm->idle);
......
...@@ -288,9 +288,12 @@ struct amdgpu_vm { ...@@ -288,9 +288,12 @@ struct amdgpu_vm {
/* Lock to protect vm_bo add/del/move on all lists of vm */ /* Lock to protect vm_bo add/del/move on all lists of vm */
spinlock_t status_lock; spinlock_t status_lock;
/* BOs who needs a validation */ /* Per-VM and PT BOs who needs a validation */
struct list_head evicted; struct list_head evicted;
/* BOs for user mode queues that need a validation */
struct list_head evicted_user;
/* PT BOs which relocated and their parent need an update */ /* PT BOs which relocated and their parent need an update */
struct list_head relocated; struct list_head relocated;
...@@ -434,9 +437,10 @@ int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec, ...@@ -434,9 +437,10 @@ int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec,
unsigned int num_fences); unsigned int num_fences);
bool amdgpu_vm_ready(struct amdgpu_vm *vm); bool amdgpu_vm_ready(struct amdgpu_vm *vm);
uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm); uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm);
int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
int (*callback)(void *p, struct amdgpu_bo *bo), struct ww_acquire_ctx *ticket,
void *param); int (*callback)(void *p, struct amdgpu_bo *bo),
void *param);
int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_pipe_sync); int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_pipe_sync);
int amdgpu_vm_update_pdes(struct amdgpu_device *adev, int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
struct amdgpu_vm *vm, bool immediate); struct amdgpu_vm *vm, bool immediate);
......
...@@ -1515,9 +1515,9 @@ static int svm_range_reserve_bos(struct svm_validate_context *ctx, bool intr) ...@@ -1515,9 +1515,9 @@ static int svm_range_reserve_bos(struct svm_validate_context *ctx, bool intr)
goto unreserve_out; goto unreserve_out;
} }
r = amdgpu_vm_validate_pt_bos(pdd->dev->adev, r = amdgpu_vm_validate(pdd->dev->adev,
drm_priv_to_vm(pdd->drm_priv), drm_priv_to_vm(pdd->drm_priv), NULL,
svm_range_bo_validate, NULL); svm_range_bo_validate, NULL);
if (r) { if (r) {
pr_debug("failed %d validate pt bos\n", r); pr_debug("failed %d validate pt bos\n", r);
goto unreserve_out; goto unreserve_out;
...@@ -1641,7 +1641,9 @@ static int svm_range_validate_and_map(struct mm_struct *mm, ...@@ -1641,7 +1641,9 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
goto free_ctx; goto free_ctx;
} }
svm_range_reserve_bos(ctx, intr); r = svm_range_reserve_bos(ctx, intr);
if (r)
goto free_ctx;
p = container_of(prange->svms, struct kfd_process, svms); p = container_of(prange->svms, struct kfd_process, svms);
owner = kfd_svm_page_owner(p, find_first_bit(ctx->bitmap, owner = kfd_svm_page_owner(p, find_first_bit(ctx->bitmap,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment