Commit d98c52cf authored by Chris Wilson

drm/i915: Tighten reset_counter for reset status

In the reset_counter, we use two bits to track a GPU hang and reset. The
low bit is a "reset-in-progress" flag that we set to signal when we need
to break waiters in order for the recovery task to grab the mutex. As
soon as the recovery task has the mutex, we can clear that flag (which
we do by incrementing the reset_counter, thereby incrementing the global
reset epoch). By clearing that flag when the recovery task holds the
struct_mutex, we can forgo a second flag that simply tells GEM to ignore
the "reset-in-progress" flag.

The second flag we store in the reset_counter is whether the
reset failed and we consider the GPU terminally wedged. Whilst this flag
is set, all access to the GPU through GEM (as opposed to direct mmio
access) is verboten.
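
The diff below leans on __i915_reset_in_progress() and the I915_WEDGED bit; here is a minimal sketch of those predicates, modelled on the i915_drv.h helpers of this era (the exact mask values are an assumption, since the header hunk is collapsed in this view):

	#include <linux/types.h>

	#define I915_RESET_IN_PROGRESS_FLAG	1
	#define I915_WEDGED			(1u << 31)	/* assumed: high bit */

	/* true while the recovery task still needs waiters broken */
	static inline bool __i915_reset_in_progress(u32 reset)
	{
		return reset & I915_RESET_IN_PROGRESS_FLAG;
	}

	/* true once a reset has failed and the GPU is considered lost */
	static inline bool __i915_terminally_wedged(u32 reset)
	{
		return reset & I915_WEDGED;
	}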

PS: Fun is in store, as in the future we want to move from a global
reset epoch to per-engine resets with request recovery.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: http://patchwork.freedesktop.org/patch/msgid/1460565315-7748-6-git-send-email-chris@chris-wilson.co.uk
parent 7f1847eb
drivers/gpu/drm/i915/i915_debugfs.c

@@ -4722,7 +4722,7 @@ i915_wedged_get(void *data, u64 *val)
 	struct drm_device *dev = data;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
-	*val = i915_reset_counter(&dev_priv->gpu_error);
+	*val = i915_terminally_wedged(&dev_priv->gpu_error);
 
 	return 0;
 }
@@ -4741,7 +4741,7 @@ i915_wedged_set(void *data, u64 val)
 	 * while it is writing to 'i915_wedged'
 	 */
-	if (i915_reset_in_progress_or_wedged(&dev_priv->gpu_error))
+	if (i915_reset_in_progress(&dev_priv->gpu_error))
 		return -EAGAIN;
 
 	intel_runtime_pm_get(dev_priv);
drivers/gpu/drm/i915/i915_drv.c

@@ -880,23 +880,32 @@ int i915_resume_switcheroo(struct drm_device *dev)
 int i915_reset(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	bool simulated;
+	struct i915_gpu_error *error = &dev_priv->gpu_error;
+	unsigned reset_counter;
 	int ret;
 
 	intel_reset_gt_powersave(dev);
 
 	mutex_lock(&dev->struct_mutex);
 
-	i915_gem_reset(dev);
+	/* Clear any previous failed attempts at recovery. Time to try again. */
+	atomic_andnot(I915_WEDGED, &error->reset_counter);
 
-	simulated = dev_priv->gpu_error.stop_rings != 0;
+	/* Clear the reset-in-progress flag and increment the reset epoch. */
+	reset_counter = atomic_inc_return(&error->reset_counter);
+	if (WARN_ON(__i915_reset_in_progress(reset_counter))) {
+		ret = -EIO;
+		goto error;
+	}
+
+	i915_gem_reset(dev);
 
 	ret = intel_gpu_reset(dev, ALL_ENGINES);
 
 	/* Also reset the gpu hangman. */
-	if (simulated) {
+	if (error->stop_rings != 0) {
 		DRM_INFO("Simulated gpu hang, resetting stop_rings\n");
-		dev_priv->gpu_error.stop_rings = 0;
+		error->stop_rings = 0;
 		if (ret == -ENODEV) {
 			DRM_INFO("Reset not implemented, but ignoring "
 				 "error for simulated gpu hangs\n");
@@ -909,8 +918,7 @@ int i915_reset(struct drm_device *dev)
 	if (ret) {
 		DRM_ERROR("Failed to reset chip: %i\n", ret);
-		mutex_unlock(&dev->struct_mutex);
-		return ret;
+		goto error;
 	}
 
 	intel_overlay_reset(dev_priv);
@@ -929,20 +937,14 @@ int i915_reset(struct drm_device *dev)
 	 * was running at the time of the reset (i.e. we weren't VT
 	 * switched away).
 	 */
-	/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset */
-	dev_priv->gpu_error.reload_in_reset = true;
-
 	ret = i915_gem_init_hw(dev);
-
-	dev_priv->gpu_error.reload_in_reset = false;
-
-	mutex_unlock(&dev->struct_mutex);
 	if (ret) {
 		DRM_ERROR("Failed hw init on reset %d\n", ret);
-		return ret;
+		goto error;
 	}
 
+	mutex_unlock(&dev->struct_mutex);
+
@@ -953,6 +955,11 @@ int i915_reset(struct drm_device *dev)
 		intel_enable_gt_powersave(dev);
 
 	return 0;
+
+error:
+	atomic_or(I915_WEDGED, &error->reset_counter);
+	mutex_unlock(&dev->struct_mutex);
+	return ret;
 }
 
 static int i915_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
drivers/gpu/drm/i915/i915_drv.h

@@ -1400,9 +1400,6 @@ struct i915_gpu_error {
 	/* For missed irq/seqno simulation. */
 	unsigned int test_irq_rings;
-
-	/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset */
-	bool reload_in_reset;
 };
 
 enum modeset_restore {
drivers/gpu/drm/i915/i915_gem.c

@@ -83,9 +83,7 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
 {
 	int ret;
 
-#define EXIT_COND (!i915_reset_in_progress_or_wedged(error) || \
-		   i915_terminally_wedged(error))
-	if (EXIT_COND)
+	if (!i915_reset_in_progress(error))
 		return 0;
 
 	/*
@@ -94,17 +92,16 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
 	 * we should simply try to bail out and fail as gracefully as possible.
 	 */
 	ret = wait_event_interruptible_timeout(error->reset_queue,
-					       EXIT_COND,
+					       !i915_reset_in_progress(error),
 					       10*HZ);
 	if (ret == 0) {
 		DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
 		return -EIO;
 	} else if (ret < 0) {
 		return ret;
-	}
-#undef EXIT_COND
-
-	return 0;
+	} else {
+		return 0;
+	}
 }
 
 int i915_mutex_lock_interruptible(struct drm_device *dev)
@@ -1113,21 +1110,15 @@ i915_gem_check_wedge(struct i915_gpu_error *error,
 			      bool interruptible)
 {
 	if (i915_reset_in_progress_or_wedged(error)) {
+		/* Recovery complete, but the reset failed ... */
+		if (i915_terminally_wedged(error))
+			return -EIO;
+
 		/* Non-interruptible callers can't handle -EAGAIN, hence return
 		 * -EIO unconditionally for these. */
 		if (!interruptible)
 			return -EIO;
 
-		/* Recovery complete, but the reset failed ... */
-		if (i915_terminally_wedged(error))
-			return -EIO;
-
-		/*
-		 * Check if GPU Reset is in progress - we need intel_ring_begin
-		 * to work properly to reinit the hw state while the gpu is
-		 * still marked as reset-in-progress. Handle this with a flag.
-		 */
-		if (!error->reload_in_reset)
-			return -EAGAIN;
+		return -EAGAIN;
 	}
drivers/gpu/drm/i915/i915_irq.c

@@ -2483,7 +2483,6 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv,
 static void i915_reset_and_wakeup(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = to_i915(dev);
-	struct i915_gpu_error *error = &dev_priv->gpu_error;
 	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
 	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
 	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
@@ -2501,7 +2500,7 @@ static void i915_reset_and_wakeup(struct drm_device *dev)
 	 * the reset in-progress bit is only ever set by code outside of this
 	 * work we don't need to worry about any other races.
 	 */
-	if (i915_reset_in_progress_or_wedged(error) && !i915_terminally_wedged(error)) {
+	if (i915_reset_in_progress(&dev_priv->gpu_error)) {
 		DRM_DEBUG_DRIVER("resetting chip\n");
 		kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE,
 				   reset_event);
@@ -2529,25 +2528,9 @@ static void i915_reset_and_wakeup(struct drm_device *dev)
 		intel_runtime_pm_put(dev_priv);
 
-		if (ret == 0) {
-			/*
-			 * After all the gem state is reset, increment the reset
-			 * counter and wake up everyone waiting for the reset to
-			 * complete.
-			 *
-			 * Since unlock operations are a one-sided barrier only,
-			 * we need to insert a barrier here to order any seqno
-			 * updates before
-			 * the counter increment.
-			 */
-			smp_mb__before_atomic();
-			atomic_inc(&dev_priv->gpu_error.reset_counter);
-
+		if (ret == 0)
 			kobject_uevent_env(&dev->primary->kdev->kobj,
 					   KOBJ_CHANGE, reset_done_event);
-		} else {
-			atomic_or(I915_WEDGED, &error->reset_counter);
-		}
 
 		/*
 		 * Note: The wake_up also serves as a memory barrier so that