Commit 9d858b69 authored by Matthew Brost, committed by Rodrigo Vivi

drm/xe: Ban a VM if rebind worker hits an error

We cannot recover a VM once its rebind worker hits an error, so ban the VM
when this happens to ensure we do not attempt to place it on the hardware
again.

A follow-up will inform the user when this happens.

v2: Return -ECANCELED in exec if the VM is closed or banned, check for
    closed or banned while holding the VM lock.
v3: Fix lockdep splat by looking up the engine outside of vm->lock
v4: Fix error path when engine lookup fails
v5: Add debug message in rebind worker on error, update comments wrt
locking, add xe_vm_close helper
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
parent 54c9fb7e
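
For readers skimming the diff below: the change introduces a small, one-way state machine on the VM. "Closed" means userspace has torn the VM down (vm->size is zeroed by the new xe_vm_close() helper), "banned" is a new sticky XE_VM_FLAG_BANNED bit set by the rebind worker via xe_vm_kill() when it hits an unrecoverable error, and both states are only stable while vm->lock is held, which is why every ioctl path now takes that lock before checking xe_vm_is_closed_or_banned(). Below is a minimal standalone sketch of that idea, using plain pthreads and made-up toy_vm_* names rather than the driver's actual types:

/*
 * Toy model of the closed/banned VM states added by this commit.
 * toy_vm, toy_vm_kill() etc. are illustrative names, not the xe driver's.
 */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_VM_FLAG_BANNED	(1ul << 0)

struct toy_vm {
	pthread_rwlock_t lock;	/* stands in for vm->lock */
	unsigned long size;	/* 0 means the VM has been closed */
	unsigned long flags;
};

static bool toy_vm_is_closed(struct toy_vm *vm)
{
	return !vm->size;
}

static bool toy_vm_is_banned(struct toy_vm *vm)
{
	return vm->flags & TOY_VM_FLAG_BANNED;
}

/* Only meaningful while the VM lock is held, like xe_vm_is_closed_or_banned() */
static bool toy_vm_is_closed_or_banned(struct toy_vm *vm)
{
	return toy_vm_is_closed(vm) || toy_vm_is_banned(vm);
}

/* What the rebind worker does on an unrecoverable error: ban, never unban */
static void toy_vm_kill(struct toy_vm *vm)
{
	vm->flags |= TOY_VM_FLAG_BANNED;
}

/* Roughly the exec-path check: take the lock, refuse closed or banned VMs */
static int toy_vm_exec(struct toy_vm *vm)
{
	int ret = 0;

	pthread_rwlock_rdlock(&vm->lock);
	if (toy_vm_is_closed_or_banned(vm))
		ret = -ECANCELED;
	pthread_rwlock_unlock(&vm->lock);
	return ret;
}

int main(void)
{
	struct toy_vm vm = { .size = 1ul << 20 };

	pthread_rwlock_init(&vm.lock, NULL);
	printf("exec before ban: %d\n", toy_vm_exec(&vm));	/* 0 */

	pthread_rwlock_wrlock(&vm.lock);
	toy_vm_kill(&vm);	/* rebind worker hit an error */
	pthread_rwlock_unlock(&vm.lock);

	printf("exec after ban:  %d\n", toy_vm_exec(&vm));	/* -ECANCELED */
	pthread_rwlock_destroy(&vm.lock);
	return 0;
}

The real driver additionally kills every preempt engine attached to the VM and emits the new xe_vm_kill trace event, as the xe_vm.c hunks below show.
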
@@ -597,10 +597,23 @@ int xe_engine_create_ioctl(struct drm_device *dev, void *data,
 	if (XE_IOCTL_ERR(xe, !vm))
 		return -ENOENT;
 
+	err = down_read_interruptible(&vm->lock);
+	if (err) {
+		xe_vm_put(vm);
+		return err;
+	}
+
+	if (XE_IOCTL_ERR(xe, xe_vm_is_closed_or_banned(vm))) {
+		up_read(&vm->lock);
+		xe_vm_put(vm);
+		return -ENOENT;
+	}
+
 	e = xe_engine_create(xe, vm, logical_mask,
 			     args->width, hwe,
 			     xe_vm_no_dma_fences(vm) ? 0 :
 			     ENGINE_FLAG_PERSISTENT);
+	up_read(&vm->lock);
 	xe_vm_put(vm);
 	if (IS_ERR(e))
 		return PTR_ERR(e);
...
@@ -297,9 +297,9 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	if (err)
 		goto err_unlock_list;
 
-	if (xe_vm_is_closed(engine->vm)) {
-		drm_warn(&xe->drm, "Trying to schedule after vm is closed\n");
-		err = -EIO;
+	if (xe_vm_is_closed_or_banned(engine->vm)) {
+		drm_warn(&xe->drm, "Trying to schedule after vm is closed or banned\n");
+		err = -ECANCELED;
 		goto err_engine_end;
 	}
...
@@ -478,6 +478,11 @@ DECLARE_EVENT_CLASS(xe_vm,
 		      __entry->asid)
 );
 
+DEFINE_EVENT(xe_vm, xe_vm_kill,
+	     TP_PROTO(struct xe_vm *vm),
+	     TP_ARGS(vm)
+);
+
 DEFINE_EVENT(xe_vm, xe_vm_create,
 	     TP_PROTO(struct xe_vm *vm),
 	     TP_ARGS(vm)
...
@@ -514,6 +514,24 @@ void xe_vm_unlock_dma_resv(struct xe_vm *vm,
 
 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
 
+static void xe_vm_kill(struct xe_vm *vm)
+{
+	struct ww_acquire_ctx ww;
+	struct xe_engine *e;
+
+	lockdep_assert_held(&vm->lock);
+
+	xe_vm_lock(vm, &ww, 0, false);
+	vm->flags |= XE_VM_FLAG_BANNED;
+	trace_xe_vm_kill(vm);
+
+	list_for_each_entry(e, &vm->preempt.engines, compute.link)
+		e->ops->kill(e);
+	xe_vm_unlock(vm, &ww);
+
+	/* TODO: Inform user the VM is banned */
+}
+
 static void preempt_rebind_work_func(struct work_struct *w)
 {
 	struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
@@ -533,13 +551,14 @@ static void preempt_rebind_work_func(struct work_struct *w)
 	XE_BUG_ON(!xe_vm_in_compute_mode(vm));
 	trace_xe_vm_rebind_worker_enter(vm);
 
-	if (xe_vm_is_closed(vm)) {
+	down_write(&vm->lock);
+
+	if (xe_vm_is_closed_or_banned(vm)) {
+		up_write(&vm->lock);
 		trace_xe_vm_rebind_worker_exit(vm);
 		return;
 	}
 
-	down_write(&vm->lock);
-
 retry:
 	if (vm->async_ops.error)
 		goto out_unlock_outer;
@@ -666,11 +685,14 @@ static void preempt_rebind_work_func(struct work_struct *w)
 			goto retry;
 		}
 	}
 
+	if (err) {
+		drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
+		xe_vm_kill(vm);
+	}
 	up_write(&vm->lock);
 
 	free_preempt_fences(&preempt_fences);
 
-	XE_WARN_ON(err < 0);	/* TODO: Kill VM or put in error state */
 	trace_xe_vm_rebind_worker_exit(vm);
 }
@@ -1140,11 +1162,12 @@ xe_vm_find_overlapping_vma(struct xe_vm *vm, const struct xe_vma *vma)
 {
 	struct rb_node *node;
 
-	if (xe_vm_is_closed(vm))
+	lockdep_assert_held(&vm->lock);
+
+	if (xe_vm_is_closed_or_banned(vm))
 		return NULL;
 
 	XE_BUG_ON(vma->end >= vm->size);
 
-	lockdep_assert_held(&vm->lock);
-
 	node = rb_find(vma, &vm->vmas, xe_vma_cmp_vma_cb);
@@ -1377,6 +1400,13 @@ static void vm_error_capture(struct xe_vm *vm, int err,
 	wake_up_all(&vm->async_ops.error_capture.wq);
 }
 
+static void xe_vm_close(struct xe_vm *vm)
+{
+	down_write(&vm->lock);
+	vm->size = 0;
+	up_write(&vm->lock);
+}
+
 void xe_vm_close_and_put(struct xe_vm *vm)
 {
 	struct rb_root contested = RB_ROOT;
@@ -1387,8 +1417,8 @@ void xe_vm_close_and_put(struct xe_vm *vm)
 
 	XE_BUG_ON(vm->preempt.num_engines);
 
-	vm->size = 0;
-	smp_mb();
+	xe_vm_close(vm);
 	flush_async_ops(vm);
 	if (xe_vm_in_compute_mode(vm))
 		flush_work(&vm->preempt.rebind_work);
@@ -3072,30 +3102,34 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	if (err)
 		return err;
 
-	vm = xe_vm_lookup(xef, args->vm_id);
-	if (XE_IOCTL_ERR(xe, !vm)) {
-		err = -EINVAL;
-		goto free_objs;
-	}
-
-	if (XE_IOCTL_ERR(xe, xe_vm_is_closed(vm))) {
-		drm_err(dev, "VM closed while we began looking up?\n");
-		err = -ENOENT;
-		goto put_vm;
-	}
-
 	if (args->engine_id) {
 		e = xe_engine_lookup(xef, args->engine_id);
 		if (XE_IOCTL_ERR(xe, !e)) {
 			err = -ENOENT;
-			goto put_vm;
+			goto free_objs;
 		}
 		if (XE_IOCTL_ERR(xe, !(e->flags & ENGINE_FLAG_VM))) {
 			err = -EINVAL;
 			goto put_engine;
 		}
 	}
 
+	vm = xe_vm_lookup(xef, args->vm_id);
+	if (XE_IOCTL_ERR(xe, !vm)) {
+		err = -EINVAL;
+		goto put_engine;
+	}
+
+	err = down_write_killable(&vm->lock);
+	if (err)
+		goto put_vm;
+
+	if (XE_IOCTL_ERR(xe, xe_vm_is_closed_or_banned(vm))) {
+		err = -ENOENT;
+		goto release_vm_lock;
+	}
+
 	if (VM_BIND_OP(bind_ops[0].op) == XE_VM_BIND_OP_RESTART) {
 		if (XE_IOCTL_ERR(xe, !(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS)))
 			err = -EOPNOTSUPP;
@@ -3105,10 +3139,8 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 			err = -EPROTO;
 
 		if (!err) {
-			down_write(&vm->lock);
 			trace_xe_vm_restart(vm);
 			vm_set_async_error(vm, 0);
-			up_write(&vm->lock);
 
 			queue_work(system_unbound_wq, &vm->async_ops.work);
@@ -3117,13 +3149,13 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 				xe_vm_queue_rebind_worker(vm);
 		}
 
-		goto put_engine;
+		goto release_vm_lock;
 	}
 
 	if (XE_IOCTL_ERR(xe, !vm->async_ops.error &&
 			 async != !!(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS))) {
 		err = -EOPNOTSUPP;
-		goto put_engine;
+		goto release_vm_lock;
 	}
 
 	for (i = 0; i < args->num_binds; ++i) {
@@ -3133,7 +3165,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 		if (XE_IOCTL_ERR(xe, range > vm->size) ||
 		    XE_IOCTL_ERR(xe, addr > vm->size - range)) {
 			err = -EINVAL;
-			goto put_engine;
+			goto release_vm_lock;
 		}
 
 		if (bind_ops[i].tile_mask) {
@@ -3142,7 +3174,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 			if (XE_IOCTL_ERR(xe, bind_ops[i].tile_mask &
 					 ~valid_tiles)) {
 				err = -EINVAL;
-				goto put_engine;
+				goto release_vm_lock;
 			}
 		}
 	}
@@ -3150,13 +3182,13 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	bos = kzalloc(sizeof(*bos) * args->num_binds, GFP_KERNEL);
 	if (!bos) {
 		err = -ENOMEM;
-		goto put_engine;
+		goto release_vm_lock;
 	}
 
 	vmas = kzalloc(sizeof(*vmas) * args->num_binds, GFP_KERNEL);
 	if (!vmas) {
 		err = -ENOMEM;
-		goto put_engine;
+		goto release_vm_lock;
 	}
 
 	for (i = 0; i < args->num_binds; ++i) {
@@ -3211,10 +3243,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 			goto free_syncs;
 	}
 
-	err = down_write_killable(&vm->lock);
-	if (err)
-		goto free_syncs;
-
 	/* Do some error checking first to make the unwind easier */
 	for (i = 0; i < args->num_binds; ++i) {
 		u64 range = bind_ops[i].range;
@@ -3223,7 +3251,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 		err = __vm_bind_ioctl_lookup_vma(vm, bos[i], addr, range, op);
 		if (err)
-			goto release_vm_lock;
+			goto free_syncs;
 	}
 
 	for (i = 0; i < args->num_binds; ++i) {
@@ -3343,8 +3371,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 			break;
 		}
 	}
-release_vm_lock:
-	up_write(&vm->lock);
 free_syncs:
 	while (num_syncs--) {
 		if (async && j &&
@@ -3357,11 +3383,13 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 put_obj:
 	for (i = j; i < args->num_binds; ++i)
 		xe_bo_put(bos[i]);
+release_vm_lock:
+	up_write(&vm->lock);
+put_vm:
+	xe_vm_put(vm);
 put_engine:
 	if (e)
 		xe_engine_put(e);
-put_vm:
-	xe_vm_put(vm);
 free_objs:
 	kfree(bos);
 	kfree(vmas);
...
@@ -45,10 +45,21 @@ void xe_vm_unlock(struct xe_vm *vm, struct ww_acquire_ctx *ww);
 
 static inline bool xe_vm_is_closed(struct xe_vm *vm)
 {
-	/* Only guaranteed not to change when vm->resv is held */
+	/* Only guaranteed not to change when vm->lock is held */
 	return !vm->size;
 }
 
+static inline bool xe_vm_is_banned(struct xe_vm *vm)
+{
+	return vm->flags & XE_VM_FLAG_BANNED;
+}
+
+static inline bool xe_vm_is_closed_or_banned(struct xe_vm *vm)
+{
+	lockdep_assert_held(&vm->lock);
+	return xe_vm_is_closed(vm) || xe_vm_is_banned(vm);
+}
+
 struct xe_vma *
 xe_vm_find_overlapping_vma(struct xe_vm *vm, const struct xe_vma *vma);
...
@@ -313,7 +313,7 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void *data,
 	if (XE_IOCTL_ERR(xe, !vm))
 		return -EINVAL;
 
-	if (XE_IOCTL_ERR(xe, xe_vm_is_closed(vm))) {
+	if (XE_IOCTL_ERR(xe, xe_vm_is_closed_or_banned(vm))) {
 		err = -ENOENT;
 		goto put_vm;
 	}
...
@@ -176,15 +176,19 @@ struct xe_vm {
 	struct xe_bo *scratch_bo[XE_MAX_TILES_PER_DEVICE];
 	struct xe_pt *scratch_pt[XE_MAX_TILES_PER_DEVICE][XE_VM_MAX_LEVEL];
 
-	/** @flags: flags for this VM, statically setup a creation time */
+	/**
+	 * @flags: flags for this VM, statically setup a creation time aside
+	 * from XE_VM_FLAG_BANNED which requires vm->lock to set / read safely
+	 */
 #define XE_VM_FLAGS_64K			BIT(0)
 #define XE_VM_FLAG_COMPUTE_MODE		BIT(1)
 #define XE_VM_FLAG_ASYNC_BIND_OPS	BIT(2)
 #define XE_VM_FLAG_MIGRATION		BIT(3)
 #define XE_VM_FLAG_SCRATCH_PAGE		BIT(4)
 #define XE_VM_FLAG_FAULT_MODE		BIT(5)
-#define XE_VM_FLAG_GT_ID(flags)		(((flags) >> 6) & 0x3)
-#define XE_VM_FLAG_SET_TILE_ID(tile)	((tile)->id << 6)
+#define XE_VM_FLAG_BANNED		BIT(6)
+#define XE_VM_FLAG_GT_ID(flags)		(((flags) >> 7) & 0x3)
+#define XE_VM_FLAG_SET_TILE_ID(tile)	((tile)->id << 7)
 	unsigned long flags;
 
 	/** @composite_fence_ctx: context composite fence */
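
One detail worth calling out in the last hunk: XE_VM_FLAG_BANNED takes over BIT(6), so the tile/GT id field packed into the upper flag bits moves from bits 6-7 to bits 7-8, and XE_VM_FLAG_GT_ID()/XE_VM_FLAG_SET_TILE_ID() are updated as a pair. Below is a small illustrative check of that round trip; the bit positions are copied from the new layout, but the SET macro is rewritten here to take a plain id instead of a tile pointer so the snippet is self-contained:

/* Round-trip check for the relocated tile-id field (illustrative only). */
#include <assert.h>

#define BIT(n)				(1ul << (n))
#define XE_VM_FLAG_BANNED		BIT(6)
#define XE_VM_FLAG_GT_ID(flags)		(((flags) >> 7) & 0x3)
#define TOY_VM_FLAG_SET_TILE_ID(id)	((unsigned long)(id) << 7)

int main(void)
{
	for (unsigned int id = 0; id < 4; id++) {
		unsigned long flags = XE_VM_FLAG_BANNED | TOY_VM_FLAG_SET_TILE_ID(id);

		/* The banned bit and the tile id no longer overlap. */
		assert(XE_VM_FLAG_GT_ID(flags) == id);
		assert(flags & XE_VM_FLAG_BANNED);
	}
	return 0;
}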