Commit b26a32a8 authored by Chris Wilson's avatar Chris Wilson Committed by Rodrigo Vivi

drm/i915: Always run hangcheck while the GPU is busy

Previously, we relied on only running the hangcheck while somebody was
waiting on the GPU, in order to minimise the amount of time hangcheck
had to run. (If nobody was watching the GPU, nobody would notice if the
GPU wasn't responding -- eventually somebody would care and so kick
hangcheck into action.) However, this falls apart from around commit
4680816b ("drm/i915: Wait first for submission, before waiting for
request completion"), as not all waiters declare themselves to hangcheck
and so we could switch off hangcheck and miss GPU hangs even when
waiting under the struct_mutex.

If we enable hangcheck from the first request submission, and let it run
until the GPU is idle again, we forgo all the complexity involved with
only enabling around waiters. We just have to remember to be careful that
we do not declare a GPU hang when idly waiting for the next request to
become ready, as we will run hangcheck continuously even when the
engines are stalled waiting for external events. This should be true
already as we should only be tracking requests submitted to hardware for
execution as an indicator that the engine is busy.

Fixes: 4680816b ("drm/i915: Wait first for submission, before waiting for request completion")
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104840
Signed-off-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180129144104.3921-1-chris@chris-wilson.co.uk
Reviewed-by: default avatarMika Kuoppala <mika.kuoppala@linux.intel.com>
(cherry picked from commit 88923048)
Signed-off-by: default avatarRodrigo Vivi <rodrigo.vivi@intel.com>
parent b5a756a7
...@@ -3323,16 +3323,15 @@ i915_gem_retire_work_handler(struct work_struct *work) ...@@ -3323,16 +3323,15 @@ i915_gem_retire_work_handler(struct work_struct *work)
mutex_unlock(&dev->struct_mutex); mutex_unlock(&dev->struct_mutex);
} }
/* Keep the retire handler running until we are finally idle. /*
* Keep the retire handler running until we are finally idle.
* We do not need to do this test under locking as in the worst-case * We do not need to do this test under locking as in the worst-case
* we queue the retire worker once too often. * we queue the retire worker once too often.
*/ */
if (READ_ONCE(dev_priv->gt.awake)) { if (READ_ONCE(dev_priv->gt.awake))
i915_queue_hangcheck(dev_priv);
queue_delayed_work(dev_priv->wq, queue_delayed_work(dev_priv->wq,
&dev_priv->gt.retire_work, &dev_priv->gt.retire_work,
round_jiffies_up_relative(HZ)); round_jiffies_up_relative(HZ));
}
} }
static inline bool static inline bool
......
...@@ -276,6 +276,8 @@ static void mark_busy(struct drm_i915_private *i915) ...@@ -276,6 +276,8 @@ static void mark_busy(struct drm_i915_private *i915)
intel_engines_unpark(i915); intel_engines_unpark(i915);
i915_queue_hangcheck(i915);
queue_delayed_work(i915->wq, queue_delayed_work(i915->wq,
&i915->gt.retire_work, &i915->gt.retire_work,
round_jiffies_up_relative(HZ)); round_jiffies_up_relative(HZ));
......
...@@ -149,17 +149,6 @@ static void intel_breadcrumbs_fake_irq(struct timer_list *t) ...@@ -149,17 +149,6 @@ static void intel_breadcrumbs_fake_irq(struct timer_list *t)
return; return;
mod_timer(&b->fake_irq, jiffies + 1); mod_timer(&b->fake_irq, jiffies + 1);
/* Ensure that even if the GPU hangs, we get woken up.
*
* However, note that if no one is waiting, we never notice
* a gpu hang. Eventually, we will have to wait for a resource
* held by the GPU and so trigger a hangcheck. In the most
* pathological case, this will be upon memory starvation! To
* prevent this, we also queue the hangcheck from the retire
* worker.
*/
i915_queue_hangcheck(engine->i915);
} }
static void irq_enable(struct intel_engine_cs *engine) static void irq_enable(struct intel_engine_cs *engine)
......
...@@ -411,7 +411,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work) ...@@ -411,7 +411,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
struct intel_engine_cs *engine; struct intel_engine_cs *engine;
enum intel_engine_id id; enum intel_engine_id id;
unsigned int hung = 0, stuck = 0; unsigned int hung = 0, stuck = 0;
int busy_count = 0;
if (!i915_modparams.enable_hangcheck) if (!i915_modparams.enable_hangcheck)
return; return;
...@@ -429,7 +428,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work) ...@@ -429,7 +428,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
intel_uncore_arm_unclaimed_mmio_detection(dev_priv); intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
for_each_engine(engine, dev_priv, id) { for_each_engine(engine, dev_priv, id) {
const bool busy = intel_engine_has_waiter(engine);
struct intel_engine_hangcheck hc; struct intel_engine_hangcheck hc;
semaphore_clear_deadlocks(dev_priv); semaphore_clear_deadlocks(dev_priv);
...@@ -443,16 +441,13 @@ static void i915_hangcheck_elapsed(struct work_struct *work) ...@@ -443,16 +441,13 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
if (hc.action != ENGINE_DEAD) if (hc.action != ENGINE_DEAD)
stuck |= intel_engine_flag(engine); stuck |= intel_engine_flag(engine);
} }
busy_count += busy;
} }
if (hung) if (hung)
hangcheck_declare_hang(dev_priv, hung, stuck); hangcheck_declare_hang(dev_priv, hung, stuck);
/* Reset timer in case GPU hangs without another request being added */ /* Reset timer in case GPU hangs without another request being added */
if (busy_count) i915_queue_hangcheck(dev_priv);
i915_queue_hangcheck(dev_priv);
} }
void intel_engine_init_hangcheck(struct intel_engine_cs *engine) void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment