Commit aab4707f authored by Chris Wilson

drm/i915/gt: Harden the heartbeat against a stuck driver

If the driver gets stuck holding the kernel timeline, we cannot issue a
heartbeat and so fail to discover that the driver is indeed stuck and do
not issue a GPU reset (which would hopefully unstick the driver!).
Switch to using a trylock so that we can query if the heartbeat's
timeline mutex is locked elsewhere, and then use the timer to probe if it
remains stuck at the same spot for consecutive heartbeats, indicating
that the mutex has not been released and the engine has not progressed.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200702095219.963-1-chris@chris-wilson.co.uk
parent 680c45c7
...@@ -65,6 +65,7 @@ static void heartbeat(struct work_struct *wrk) ...@@ -65,6 +65,7 @@ static void heartbeat(struct work_struct *wrk)
container_of(wrk, typeof(*engine), heartbeat.work.work); container_of(wrk, typeof(*engine), heartbeat.work.work);
struct intel_context *ce = engine->kernel_context; struct intel_context *ce = engine->kernel_context;
struct i915_request *rq; struct i915_request *rq;
unsigned long serial;
/* Just in case everything has gone horribly wrong, give it a kick */ /* Just in case everything has gone horribly wrong, give it a kick */
intel_engine_flush_submission(engine); intel_engine_flush_submission(engine);
...@@ -122,10 +123,19 @@ static void heartbeat(struct work_struct *wrk) ...@@ -122,10 +123,19 @@ static void heartbeat(struct work_struct *wrk)
goto out; goto out;
} }
if (engine->wakeref_serial == engine->serial) serial = READ_ONCE(engine->serial);
if (engine->wakeref_serial == serial)
goto out; goto out;
mutex_lock(&ce->timeline->mutex); if (!mutex_trylock(&ce->timeline->mutex)) {
/* Unable to lock the kernel timeline, is the engine stuck? */
if (xchg(&engine->heartbeat.blocked, serial) == serial)
intel_gt_handle_error(engine->gt, engine->mask,
I915_ERROR_CAPTURE,
"no heartbeat on %s",
engine->name);
goto out;
}
intel_context_enter(ce); intel_context_enter(ce);
rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN); rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN);
......
...@@ -348,6 +348,7 @@ struct intel_engine_cs { ...@@ -348,6 +348,7 @@ struct intel_engine_cs {
struct { struct {
struct delayed_work work; struct delayed_work work;
struct i915_request *systole; struct i915_request *systole;
unsigned long blocked;
} heartbeat; } heartbeat;
unsigned long serial; unsigned long serial;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment