Commit 094f9a54 authored by Chris Wilson's avatar Chris Wilson Committed by Daniel Vetter

drm/i915: Fix __wait_seqno to use true infinite timeouts

When we switched to always using a timeout in conjunction with
wait_seqno, we lost the ability to detect missed interrupts. Since, we
have had issues with interrupts on a number of generations, and they are
required to be delivered in a timely fashion for a smooth UX, it is
important that we do log errors found in the wild and prevent the
display stalling for upwards of 1s every time the seqno interrupt is
missed.

Rather than continue to fix up the timeouts to work around the interface
impedence in wait_event_*(), open code the combination of
wait_event[_interruptible][_timeout], and use the exposed timer to
poll for seqno should we detect a lost interrupt.

v2: In order to satisfy the debug requirement of logging missed
interrupts with the real world requirments of making machines work even
if interrupts are hosed, we revert to polling after detecting a missed
interrupt.

v3: Throw in a debugfs interface to simulate broken hw not reporting
interrupts.

v4: s/EGAIN/EAGAIN/ (Imre)
Signed-off-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: default avatarImre Deak <imre.deak@intel.com>
[danvet: Don't use the struct typedef in new code.]
Signed-off-by: default avatarDaniel Vetter <daniel.vetter@ffwll.ch>
parent cbb47d17
...@@ -1897,6 +1897,72 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_ring_stop_fops, ...@@ -1897,6 +1897,72 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_ring_stop_fops,
i915_ring_stop_get, i915_ring_stop_set, i915_ring_stop_get, i915_ring_stop_set,
"0x%08llx\n"); "0x%08llx\n");
static int
i915_ring_missed_irq_get(void *data, u64 *val)
{
struct drm_device *dev = data;
struct drm_i915_private *dev_priv = dev->dev_private;
*val = dev_priv->gpu_error.missed_irq_rings;
return 0;
}
static int
i915_ring_missed_irq_set(void *data, u64 val)
{
struct drm_device *dev = data;
struct drm_i915_private *dev_priv = dev->dev_private;
int ret;
/* Lock against concurrent debugfs callers */
ret = mutex_lock_interruptible(&dev->struct_mutex);
if (ret)
return ret;
dev_priv->gpu_error.missed_irq_rings = val;
mutex_unlock(&dev->struct_mutex);
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
i915_ring_missed_irq_get, i915_ring_missed_irq_set,
"0x%08llx\n");
static int
i915_ring_test_irq_get(void *data, u64 *val)
{
struct drm_device *dev = data;
struct drm_i915_private *dev_priv = dev->dev_private;
*val = dev_priv->gpu_error.test_irq_rings;
return 0;
}
static int
i915_ring_test_irq_set(void *data, u64 val)
{
struct drm_device *dev = data;
struct drm_i915_private *dev_priv = dev->dev_private;
int ret;
DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
/* Lock against concurrent debugfs callers */
ret = mutex_lock_interruptible(&dev->struct_mutex);
if (ret)
return ret;
dev_priv->gpu_error.test_irq_rings = val;
mutex_unlock(&dev->struct_mutex);
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
i915_ring_test_irq_get, i915_ring_test_irq_set,
"0x%08llx\n");
#define DROP_UNBOUND 0x1 #define DROP_UNBOUND 0x1
#define DROP_BOUND 0x2 #define DROP_BOUND 0x2
#define DROP_RETIRE 0x4 #define DROP_RETIRE 0x4
...@@ -2290,6 +2356,8 @@ static struct i915_debugfs_files { ...@@ -2290,6 +2356,8 @@ static struct i915_debugfs_files {
{"i915_min_freq", &i915_min_freq_fops}, {"i915_min_freq", &i915_min_freq_fops},
{"i915_cache_sharing", &i915_cache_sharing_fops}, {"i915_cache_sharing", &i915_cache_sharing_fops},
{"i915_ring_stop", &i915_ring_stop_fops}, {"i915_ring_stop", &i915_ring_stop_fops},
{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
{"i915_ring_test_irq", &i915_ring_test_irq_fops},
{"i915_gem_drop_caches", &i915_drop_caches_fops}, {"i915_gem_drop_caches", &i915_drop_caches_fops},
{"i915_error_state", &i915_error_state_fops}, {"i915_error_state", &i915_error_state_fops},
{"i915_next_seqno", &i915_next_seqno_fops}, {"i915_next_seqno", &i915_next_seqno_fops},
......
...@@ -1013,6 +1013,9 @@ struct i915_gpu_error { ...@@ -1013,6 +1013,9 @@ struct i915_gpu_error {
struct drm_i915_error_state *first_error; struct drm_i915_error_state *first_error;
struct work_struct work; struct work_struct work;
unsigned long missed_irq_rings;
/** /**
* State variable and reset counter controlling the reset flow * State variable and reset counter controlling the reset flow
* *
...@@ -1051,6 +1054,9 @@ struct i915_gpu_error { ...@@ -1051,6 +1054,9 @@ struct i915_gpu_error {
/* For gpu hang simulation. */ /* For gpu hang simulation. */
unsigned int stop_rings; unsigned int stop_rings;
/* For missed irq/seqno simulation. */
unsigned int test_irq_rings;
}; };
enum modeset_restore { enum modeset_restore {
......
...@@ -971,6 +971,17 @@ i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno) ...@@ -971,6 +971,17 @@ i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno)
return ret; return ret;
} }
static void fake_irq(unsigned long data)
{
wake_up_process((struct task_struct *)data);
}
static bool missed_irq(struct drm_i915_private *dev_priv,
struct intel_ring_buffer *ring)
{
return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
}
/** /**
* __wait_seqno - wait until execution of seqno has finished * __wait_seqno - wait until execution of seqno has finished
* @ring: the ring expected to report seqno * @ring: the ring expected to report seqno
...@@ -994,10 +1005,9 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, ...@@ -994,10 +1005,9 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
bool interruptible, struct timespec *timeout) bool interruptible, struct timespec *timeout)
{ {
drm_i915_private_t *dev_priv = ring->dev->dev_private; drm_i915_private_t *dev_priv = ring->dev->dev_private;
struct timespec before, now, wait_time={1,0}; struct timespec before, now;
unsigned long timeout_jiffies; DEFINE_WAIT(wait);
long end; long timeout_jiffies;
bool wait_forever = true;
int ret; int ret;
WARN(dev_priv->pc8.irqs_disabled, "IRQs disabled\n"); WARN(dev_priv->pc8.irqs_disabled, "IRQs disabled\n");
...@@ -1005,51 +1015,71 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, ...@@ -1005,51 +1015,71 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
if (i915_seqno_passed(ring->get_seqno(ring, true), seqno)) if (i915_seqno_passed(ring->get_seqno(ring, true), seqno))
return 0; return 0;
trace_i915_gem_request_wait_begin(ring, seqno); timeout_jiffies = timeout ? timespec_to_jiffies_timeout(timeout) : 1;
if (timeout != NULL) {
wait_time = *timeout;
wait_forever = false;
}
timeout_jiffies = timespec_to_jiffies_timeout(&wait_time);
if (WARN_ON(!ring->irq_get(ring))) if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)) &&
WARN_ON(!ring->irq_get(ring)))
return -ENODEV; return -ENODEV;
/* Record current time in case interrupted by signal, or wedged * */ /* Record current time in case interrupted by signal, or wedged */
trace_i915_gem_request_wait_begin(ring, seqno);
getrawmonotonic(&before); getrawmonotonic(&before);
for (;;) {
struct timer_list timer;
unsigned long expire;
#define EXIT_COND \ prepare_to_wait(&ring->irq_queue, &wait,
(i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \ interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
i915_reset_in_progress(&dev_priv->gpu_error) || \
reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
do {
if (interruptible)
end = wait_event_interruptible_timeout(ring->irq_queue,
EXIT_COND,
timeout_jiffies);
else
end = wait_event_timeout(ring->irq_queue, EXIT_COND,
timeout_jiffies);
/* We need to check whether any gpu reset happened in between /* We need to check whether any gpu reset happened in between
* the caller grabbing the seqno and now ... */ * the caller grabbing the seqno and now ... */
if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
end = -EAGAIN; /* ... but upgrade the -EAGAIN to an -EIO if the gpu
* is truely gone. */
ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
if (ret == 0)
ret = -EAGAIN;
break;
}
/* ... but upgrade the -EGAIN to an -EIO if the gpu is truely if (i915_seqno_passed(ring->get_seqno(ring, false), seqno)) {
* gone. */ ret = 0;
ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible); break;
if (ret) }
end = ret;
} while (end == 0 && wait_forever); if (interruptible && signal_pending(current)) {
ret = -ERESTARTSYS;
break;
}
if (timeout_jiffies <= 0) {
ret = -ETIME;
break;
}
timer.function = NULL;
if (timeout || missed_irq(dev_priv, ring)) {
setup_timer_on_stack(&timer, fake_irq, (unsigned long)current);
expire = jiffies + (missed_irq(dev_priv, ring) ? 1: timeout_jiffies);
mod_timer(&timer, expire);
}
schedule();
if (timeout)
timeout_jiffies = expire - jiffies;
if (timer.function) {
del_singleshot_timer_sync(&timer);
destroy_timer_on_stack(&timer);
}
}
getrawmonotonic(&now); getrawmonotonic(&now);
trace_i915_gem_request_wait_end(ring, seqno);
ring->irq_put(ring); ring->irq_put(ring);
trace_i915_gem_request_wait_end(ring, seqno);
#undef EXIT_COND finish_wait(&ring->irq_queue, &wait);
if (timeout) { if (timeout) {
struct timespec sleep_time = timespec_sub(now, before); struct timespec sleep_time = timespec_sub(now, before);
...@@ -1058,17 +1088,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, ...@@ -1058,17 +1088,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
set_normalized_timespec(timeout, 0, 0); set_normalized_timespec(timeout, 0, 0);
} }
switch (end) { return ret;
case -EIO:
case -EAGAIN: /* Wedged */
case -ERESTARTSYS: /* Signal */
return (int)end;
case 0: /* Timeout */
return -ETIME;
default: /* Completed */
WARN_ON(end < 0); /* We're not aware of other errors */
return 0;
}
} }
/** /**
......
...@@ -311,6 +311,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, ...@@ -311,6 +311,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
err_printf(m, "CCID: 0x%08x\n", error->ccid); err_printf(m, "CCID: 0x%08x\n", error->ccid);
err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings);
for (i = 0; i < dev_priv->num_fence_regs; i++) for (i = 0; i < dev_priv->num_fence_regs; i++)
err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]);
......
...@@ -2039,10 +2039,13 @@ static void i915_hangcheck_elapsed(unsigned long data) ...@@ -2039,10 +2039,13 @@ static void i915_hangcheck_elapsed(unsigned long data)
if (waitqueue_active(&ring->irq_queue)) { if (waitqueue_active(&ring->irq_queue)) {
/* Issue a wake-up to catch stuck h/w. */ /* Issue a wake-up to catch stuck h/w. */
DRM_ERROR("Hangcheck timer elapsed... %s idle\n", if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
ring->name); DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
wake_up_all(&ring->irq_queue); ring->name);
ring->hangcheck.score += HUNG; wake_up_all(&ring->irq_queue);
}
/* Safeguard against driver failure */
ring->hangcheck.score += BUSY;
} else } else
busy = false; busy = false;
} else { } else {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment