Commit 6e16d028, authored by Mika Kuoppala, committed by Mika Kuoppala

drm/i915: Split up hangcheck phases

In order to simplify hangcheck state keeping, split the hangcheck
per-engine loop into three phases: state load, action, state save.

Add a few more hangcheck actions to distinguish between seqno, head
and subunit movements. This helps to gather all the hangcheck
actions under a single switch umbrella.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
parent b2251c08
...@@ -323,8 +323,12 @@ static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a) ...@@ -323,8 +323,12 @@ static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
return "idle"; return "idle";
case HANGCHECK_WAIT: case HANGCHECK_WAIT:
return "wait"; return "wait";
case HANGCHECK_ACTIVE: case HANGCHECK_ACTIVE_SEQNO:
return "active"; return "active seqno";
case HANGCHECK_ACTIVE_HEAD:
return "active head";
case HANGCHECK_ACTIVE_SUBUNITS:
return "active subunits";
case HANGCHECK_KICK: case HANGCHECK_KICK:
return "kick"; return "kick";
case HANGCHECK_HUNG: case HANGCHECK_HUNG:
......
...@@ -236,11 +236,11 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd) ...@@ -236,11 +236,11 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd)
memset(&engine->hangcheck.instdone, 0, memset(&engine->hangcheck.instdone, 0,
sizeof(engine->hangcheck.instdone)); sizeof(engine->hangcheck.instdone));
return HANGCHECK_ACTIVE; return HANGCHECK_ACTIVE_HEAD;
} }
if (!subunits_stuck(engine)) if (!subunits_stuck(engine))
return HANGCHECK_ACTIVE; return HANGCHECK_ACTIVE_SUBUNITS;
return HANGCHECK_HUNG; return HANGCHECK_HUNG;
} }
...@@ -291,6 +291,129 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) ...@@ -291,6 +291,129 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
return HANGCHECK_HUNG; return HANGCHECK_HUNG;
} }
/*
 * State-load phase of the per-engine hangcheck: fill @hc with a fresh
 * sample of @engine.
 *
 * On return @hc holds the engine's current active head and seqno, plus the
 * score carried over from engine->hangcheck. hc->action is not written here.
 */
static void hangcheck_load_sample(struct intel_engine_cs *engine,
				  struct intel_engine_hangcheck *hc)
{
	/*
	 * No interrupt request is being served at this point, so the
	 * irq-barrier is not strictly required. Stay paranoid all the same:
	 * the barrier may have side-effects (such as preventing a broken
	 * cacheline snoop) that we rely on to observe the seqno advancing.
	 * Should the seqno appear stuck because of a stale cacheline, the
	 * GPU would be erroneously declared hung.
	 */
	if (engine->irq_seqno_barrier) {
		engine->irq_seqno_barrier(engine);
	}

	/* Sample the hardware only after the barrier above has run. */
	hc->acthd = intel_engine_get_active_head(engine);
	hc->seqno = intel_engine_get_seqno(engine);

	/* The running score starts from what the previous pass stored. */
	hc->score = engine->hangcheck.score;
}
/*
 * State-save phase of the per-engine hangcheck: commit the sample in @hc
 * back into engine->hangcheck so that the next pass can compare against it.
 *
 * Only the action, score, seqno and acthd fields are copied; any other
 * member of engine->hangcheck is left untouched by this function.
 */
static void hangcheck_store_sample(struct intel_engine_cs *engine,
				   const struct intel_engine_hangcheck *hc)
{
	/* Verdict and accumulated score for this pass. */
	engine->hangcheck.action = hc->action;
	engine->hangcheck.score = hc->score;

	/* Reference values the next sample is compared against. */
	engine->hangcheck.seqno = hc->seqno;
	engine->hangcheck.acthd = hc->acthd;
}
/*
 * Classify the fresh sample @hc against the state stored in
 * engine->hangcheck.
 *
 * Returns:
 *  - HANGCHECK_ACTIVE_SEQNO when the sampled seqno differs from the stored
 *    one;
 *  - HANGCHECK_IDLE when i915_seqno_passed() reports that the sampled seqno
 *    has passed the value returned by intel_engine_last_submit();
 *  - otherwise whatever engine_stuck() decides for the sampled active head.
 */
static enum intel_engine_hangcheck_action
hangcheck_get_action(struct intel_engine_cs *engine,
		     const struct intel_engine_hangcheck *hc)
{
	/* Any seqno movement since the last pass counts as progress. */
	if (hc->seqno != engine->hangcheck.seqno)
		return HANGCHECK_ACTIVE_SEQNO;

	/* Same seqno and work still outstanding: inspect the engine closer. */
	if (!i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine)))
		return engine_stuck(engine, hc->acthd);

	return HANGCHECK_IDLE;
}
/*
 * Action phase of the per-engine hangcheck: decide what the engine is doing
 * and adjust the score held in @hc accordingly.
 *
 * hc->action is set from hangcheck_get_action() and hc->score is updated
 * in place. On seqno movement hc->acthd is zeroed and the instdone state
 * stored in engine->hangcheck is cleared as well.
 *
 * Scoring policy: the score is always incremented while the engine is busy
 * and still processing the same request, so that no single request can run
 * indefinitely (such as a chain of batches). The only time the score is not
 * incremented is when the engine is idle or in a legitimate wait for another
 * engine; in that case the waiting engine is a victim and we want to be sure
 * we catch the right culprit. Every time the ring is kicked a somewhat
 * larger increment is added, so that a batch that is being repeatedly
 * kicked, and so is responsible for stalling the machine, is caught.
 */
static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
					struct intel_engine_hangcheck *hc)
{
	/* Score adjustments applied for each kind of observation. */
	enum {
		BUSY = 1,		/* head or subunits still moving */
		KICK = 5,		/* engine had to be kicked */
		HUNG = 20,		/* no movement detected at all */
		ACTIVE_DECAY = 15,	/* credit given back on seqno progress */
	};

	hc->action = hangcheck_get_action(engine, hc);

	switch (hc->action) {
	case HANGCHECK_IDLE:
	case HANGCHECK_WAIT:
		/* Nothing to blame this engine for. */
		break;

	case HANGCHECK_ACTIVE_HEAD:
	case HANGCHECK_ACTIVE_SUBUNITS:
		hc->score += BUSY;
		break;

	case HANGCHECK_KICK:
		hc->score += KICK;
		break;

	case HANGCHECK_HUNG:
		hc->score += HUNG;
		break;

	case HANGCHECK_ACTIVE_SEQNO:
		/* Gradually reduce the count so that we catch DoS
		 * attempts across multiple batches.
		 */
		if (hc->score > 0)
			hc->score -= ACTIVE_DECAY;
		if (hc->score < 0)
			hc->score = 0;

		/* Clear head and subunit states on seqno movement */
		hc->acthd = 0;

		memset(&engine->hangcheck.instdone, 0,
		       sizeof(engine->hangcheck.instdone));
		break;

	default:
		MISSING_CASE(hc->action);
	}
}
/*
 * Build a one-line synopsis of the hang and hand it to i915_handle_error().
 *
 * @i915:  device the engines belong to
 * @hung:  mask of engine flags whose hangcheck score reached the hung
 *         threshold
 * @stuck: subset of @hung whose last action was something other than
 *         HANGCHECK_HUNG (the engine was still doing something)
 *
 * The message reads "Hang on <engines>" when at least one engine was truly
 * hung, and "No progress on <engines>" when every flagged engine was merely
 * stuck.
 */
static void hangcheck_declare_hang(struct drm_i915_private *i915,
				   unsigned int hung,
				   unsigned int stuck)
{
	struct intel_engine_cs *engine;
	char msg[80];
	unsigned int tmp;
	int len;

	/* If some rings hung but others were still busy, only
	 * blame the hanging rings in the synopsis.
	 */
	if (stuck != hung)
		hung &= ~stuck;

	len = scnprintf(msg, sizeof(msg),
			"%s on ", stuck == hung ? "No progress" : "Hang");
	for_each_engine_masked(engine, i915, hung, tmp)
		len += scnprintf(msg + len, sizeof(msg) - len,
				 "%s, ", engine->name);

	/*
	 * Drop the trailing ", " separator. This assumes at least one engine
	 * name was appended, i.e. that the caller passes a non-zero @hung.
	 */
	msg[len-2] = '\0';

	/*
	 * Hand the synopsis over as an argument rather than as the format
	 * string itself, so that a '%' in the composed text can never be
	 * interpreted as a conversion specifier. The call is also made as a
	 * plain statement: returning an expression from a void function is
	 * not valid ISO C.
	 */
	i915_handle_error(i915, hung, "%s", msg);
}
/* /*
* This is called when the chip hasn't reported back with completed * This is called when the chip hasn't reported back with completed
* batchbuffers in a long time. We keep track per ring seqno progress and * batchbuffers in a long time. We keep track per ring seqno progress and
...@@ -308,10 +431,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work) ...@@ -308,10 +431,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
enum intel_engine_id id; enum intel_engine_id id;
unsigned int hung = 0, stuck = 0; unsigned int hung = 0, stuck = 0;
int busy_count = 0; int busy_count = 0;
#define BUSY 1
#define KICK 5
#define HUNG 20
#define ACTIVE_DECAY 15
if (!i915.enable_hangcheck) if (!i915.enable_hangcheck)
return; return;
...@@ -326,112 +445,26 @@ static void i915_hangcheck_elapsed(struct work_struct *work) ...@@ -326,112 +445,26 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
intel_uncore_arm_unclaimed_mmio_detection(dev_priv); intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
for_each_engine(engine, dev_priv, id) { for_each_engine(engine, dev_priv, id) {
bool busy = intel_engine_has_waiter(engine); struct intel_engine_hangcheck cur_state, *hc = &cur_state;
u64 acthd; const bool busy = intel_engine_has_waiter(engine);
u32 seqno;
u32 submit;
semaphore_clear_deadlocks(dev_priv); semaphore_clear_deadlocks(dev_priv);
/* We don't strictly need an irq-barrier here, as we are not hangcheck_load_sample(engine, hc);
* serving an interrupt request, be paranoid in case the hangcheck_accumulate_sample(engine, hc);
* barrier has side-effects (such as preventing a broken hangcheck_store_sample(engine, hc);
* cacheline snoop) and so be sure that we can see the seqno
* advance. If the seqno should stick, due to a stale if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
* cacheline, we would erroneously declare the GPU hung. hung |= intel_engine_flag(engine);
*/ if (hc->action != HANGCHECK_HUNG)
if (engine->irq_seqno_barrier) stuck |= intel_engine_flag(engine);
engine->irq_seqno_barrier(engine);
acthd = intel_engine_get_active_head(engine);
seqno = intel_engine_get_seqno(engine);
submit = intel_engine_last_submit(engine);
if (engine->hangcheck.seqno == seqno) {
if (i915_seqno_passed(seqno, submit)) {
engine->hangcheck.action = HANGCHECK_IDLE;
} else {
/* We always increment the hangcheck score
* if the engine is busy and still processing
* the same request, so that no single request
* can run indefinitely (such as a chain of
* batches). The only time we do not increment
* the hangcheck score on this ring, if this
* engine is in a legitimate wait for another
* engine. In that case the waiting engine is a
* victim and we want to be sure we catch the
* right culprit. Then every time we do kick
* the ring, add a small increment to the
* score so that we can catch a batch that is
* being repeatedly kicked and so responsible
* for stalling the machine.
*/
engine->hangcheck.action =
engine_stuck(engine, acthd);
switch (engine->hangcheck.action) {
case HANGCHECK_IDLE:
case HANGCHECK_WAIT:
break;
case HANGCHECK_ACTIVE:
engine->hangcheck.score += BUSY;
break;
case HANGCHECK_KICK:
engine->hangcheck.score += KICK;
break;
case HANGCHECK_HUNG:
engine->hangcheck.score += HUNG;
break;
}
}
if (engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
hung |= intel_engine_flag(engine);
if (engine->hangcheck.action != HANGCHECK_HUNG)
stuck |= intel_engine_flag(engine);
}
} else {
engine->hangcheck.action = HANGCHECK_ACTIVE;
/* Gradually reduce the count so that we catch DoS
* attempts across multiple batches.
*/
if (engine->hangcheck.score > 0)
engine->hangcheck.score -= ACTIVE_DECAY;
if (engine->hangcheck.score < 0)
engine->hangcheck.score = 0;
/* Clear head and subunit states on seqno movement */
acthd = 0;
memset(&engine->hangcheck.instdone, 0,
sizeof(engine->hangcheck.instdone));
} }
engine->hangcheck.seqno = seqno;
engine->hangcheck.acthd = acthd;
busy_count += busy; busy_count += busy;
} }
if (hung) { if (hung)
char msg[80]; hangcheck_declare_hang(dev_priv, hung, stuck);
unsigned int tmp;
int len;
/* If some rings hung but others were still busy, only
* blame the hanging rings in the synopsis.
*/
if (stuck != hung)
hung &= ~stuck;
len = scnprintf(msg, sizeof(msg),
"%s on ", stuck == hung ? "No progress" : "Hang");
for_each_engine_masked(engine, dev_priv, hung, tmp)
len += scnprintf(msg + len, sizeof(msg) - len,
"%s, ", engine->name);
msg[len-2] = '\0';
return i915_handle_error(dev_priv, hung, msg);
}
/* Reset timer in case GPU hangs without another request being added */ /* Reset timer in case GPU hangs without another request being added */
if (busy_count) if (busy_count)
......
...@@ -67,7 +67,9 @@ struct intel_hw_status_page { ...@@ -67,7 +67,9 @@ struct intel_hw_status_page {
enum intel_engine_hangcheck_action { enum intel_engine_hangcheck_action {
HANGCHECK_IDLE = 0, HANGCHECK_IDLE = 0,
HANGCHECK_WAIT, HANGCHECK_WAIT,
HANGCHECK_ACTIVE, HANGCHECK_ACTIVE_SEQNO,
HANGCHECK_ACTIVE_HEAD,
HANGCHECK_ACTIVE_SUBUNITS,
HANGCHECK_KICK, HANGCHECK_KICK,
HANGCHECK_HUNG, HANGCHECK_HUNG,
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment