Commit e8a3319c authored by John Harrison's avatar John Harrison

drm/i915: Allow error capture without a request

There was a report of error captures occurring without any hung
context being indicated despite the capture being initiated by a 'hung
context notification' from GuC. The problem was not reproducible.
However, it is possible to happen if the context in question has no
active requests. For example, if the hang was in the context switch
itself then the breadcrumb write would have occurred and the KMD would
see an idle context.

In the interests of attempting to provide as much information as
possible about a hang, it seems wise to include the engine info
regardless of whether a request was found or not. As opposed to just
prentending there was no hang at all.

So update the error capture code to always record engine information
if a context is given. Which means updating record_context() to take a
context instead of a request (which it only ever used to find the
context anyway). And split the request agnostic parts of
intel_engine_coredump_add_request() out into a seaprate function.

v2: Remove a duplicate 'if' statement (Umesh) and fix a put of a null
pointer.
v3: Tidy up request locking code flow (Tvrtko)
v4: Pull in improved info message from next patch and fix up potential
leak of GuC register state (Daniele)
Signed-off-by: default avatarJohn Harrison <John.C.Harrison@Intel.com>
Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> (v2)
Reviewed-by: default avatarDaniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Acked-by: default avatarTvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230127002842.3169194-5-John.C.Harrison@Intel.com
parent a4be3dca
......@@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct intel_engine_coredump *ee)
}
static bool record_context(struct i915_gem_context_coredump *e,
const struct i915_request *rq)
struct intel_context *ce)
{
struct i915_gem_context *ctx;
struct task_struct *task;
bool simulated;
rcu_read_lock();
ctx = rcu_dereference(rq->context->gem_context);
ctx = rcu_dereference(ce->gem_context);
if (ctx && !kref_get_unless_zero(&ctx->ref))
ctx = NULL;
rcu_read_unlock();
......@@ -1396,8 +1396,8 @@ static bool record_context(struct i915_gem_context_coredump *e,
e->guilty = atomic_read(&ctx->guilty_count);
e->active = atomic_read(&ctx->active_count);
e->total_runtime = intel_context_get_total_runtime_ns(rq->context);
e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context);
e->total_runtime = intel_context_get_total_runtime_ns(ce);
e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
simulated = i915_gem_context_no_error_capture(ctx);
......@@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_
return ee;
}
static struct intel_engine_capture_vma *
engine_coredump_add_context(struct intel_engine_coredump *ee,
struct intel_context *ce,
gfp_t gfp)
{
struct intel_engine_capture_vma *vma = NULL;
ee->simulated |= record_context(&ee->context, ce);
if (ee->simulated)
return NULL;
/*
* We need to copy these to an anonymous buffer
* as the simplest method to avoid being overwritten
* by userspace.
*/
vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
vma = capture_vma(vma, ce->state, "HW context", gfp);
return vma;
}
struct intel_engine_capture_vma *
intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
struct i915_request *rq,
gfp_t gfp)
{
struct intel_engine_capture_vma *vma = NULL;
struct intel_engine_capture_vma *vma;
ee->simulated |= record_context(&ee->context, rq);
if (ee->simulated)
vma = engine_coredump_add_context(ee, rq->context, gfp);
if (!vma)
return NULL;
/*
......@@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
*/
vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
vma = capture_user(vma, rq, gfp);
vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
vma = capture_vma(vma, rq->context->state, "HW context", gfp);
ee->rq_head = rq->head;
ee->rq_post = rq->postfix;
......@@ -1604,25 +1624,31 @@ capture_engine(struct intel_engine_cs *engine,
return NULL;
intel_engine_get_hung_entity(engine, &ce, &rq);
if (!rq || !i915_request_started(rq))
goto no_request_capture;
if (rq && !i915_request_started(rq)) {
drm_info(&engine->gt->i915->drm, "Got hung context on %s with active request %lld:%lld [0x%04X] not yet started\n",
engine->name, rq->fence.context, rq->fence.seqno, ce->guc_id.id);
i915_request_put(rq);
rq = NULL;
}
capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
if (!capture)
goto no_request_capture;
if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
intel_guc_capture_get_matching_node(engine->gt, ee, ce);
if (rq) {
capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
i915_request_put(rq);
} else if (ce) {
capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
}
intel_engine_coredump_add_vma(ee, capture, compress);
i915_request_put(rq);
if (capture) {
intel_engine_coredump_add_vma(ee, capture, compress);
return ee;
if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
intel_guc_capture_get_matching_node(engine->gt, ee, ce);
} else {
kfree(ee);
ee = NULL;
}
no_request_capture:
if (rq)
i915_request_put(rq);
kfree(ee);
return NULL;
return ee;
}
static void
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment