Commit 36abacae authored by Christian König, committed by Dave Airlie

drm/radeon: rework gpu lockup detection and processing

Previously multiple rings could trigger multiple GPU
resets at the same time.
Signed-off-by: Christian König <deathsimple@vodafone.de>
Reviewed-by: Jerome Glisse <jglisse@redhat.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
parent 7bd560e8
...@@ -255,8 +255,7 @@ struct radeon_fence_driver { ...@@ -255,8 +255,7 @@ struct radeon_fence_driver {
volatile uint32_t *cpu_addr; volatile uint32_t *cpu_addr;
atomic_t seq; atomic_t seq;
uint32_t last_seq; uint32_t last_seq;
unsigned long last_jiffies; unsigned long last_activity;
unsigned long last_timeout;
wait_queue_head_t queue; wait_queue_head_t queue;
struct list_head created; struct list_head created;
struct list_head emitted; struct list_head emitted;
......
...@@ -74,6 +74,10 @@ int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence *fence) ...@@ -74,6 +74,10 @@ int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence *fence)
radeon_fence_ring_emit(rdev, fence->ring, fence); radeon_fence_ring_emit(rdev, fence->ring, fence);
trace_radeon_fence_emit(rdev->ddev, fence->seq); trace_radeon_fence_emit(rdev->ddev, fence->seq);
fence->emitted = true; fence->emitted = true;
/* are we the first fence on a previusly idle ring? */
if (list_empty(&rdev->fence_drv[fence->ring].emitted)) {
rdev->fence_drv[fence->ring].last_activity = jiffies;
}
list_move_tail(&fence->list, &rdev->fence_drv[fence->ring].emitted); list_move_tail(&fence->list, &rdev->fence_drv[fence->ring].emitted);
write_unlock_irqrestore(&rdev->fence_lock, irq_flags); write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
return 0; return 0;
...@@ -85,34 +89,14 @@ static bool radeon_fence_poll_locked(struct radeon_device *rdev, int ring) ...@@ -85,34 +89,14 @@ static bool radeon_fence_poll_locked(struct radeon_device *rdev, int ring)
struct list_head *i, *n; struct list_head *i, *n;
uint32_t seq; uint32_t seq;
bool wake = false; bool wake = false;
unsigned long cjiffies;
seq = radeon_fence_read(rdev, ring); seq = radeon_fence_read(rdev, ring);
if (seq != rdev->fence_drv[ring].last_seq) { if (seq == rdev->fence_drv[ring].last_seq)
rdev->fence_drv[ring].last_seq = seq;
rdev->fence_drv[ring].last_jiffies = jiffies;
rdev->fence_drv[ring].last_timeout = RADEON_FENCE_JIFFIES_TIMEOUT;
} else {
cjiffies = jiffies;
if (time_after(cjiffies, rdev->fence_drv[ring].last_jiffies)) {
cjiffies -= rdev->fence_drv[ring].last_jiffies;
if (time_after(rdev->fence_drv[ring].last_timeout, cjiffies)) {
/* update the timeout */
rdev->fence_drv[ring].last_timeout -= cjiffies;
} else {
/* the 500ms timeout is elapsed we should test
* for GPU lockup
*/
rdev->fence_drv[ring].last_timeout = 1;
}
} else {
/* wrap around update last jiffies, we will just wait
* a little longer
*/
rdev->fence_drv[ring].last_jiffies = cjiffies;
}
return false; return false;
}
rdev->fence_drv[ring].last_seq = seq;
rdev->fence_drv[ring].last_activity = jiffies;
n = NULL; n = NULL;
list_for_each(i, &rdev->fence_drv[ring].emitted) { list_for_each(i, &rdev->fence_drv[ring].emitted) {
fence = list_entry(i, struct radeon_fence, list); fence = list_entry(i, struct radeon_fence, list);
...@@ -207,66 +191,84 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr) ...@@ -207,66 +191,84 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
struct radeon_device *rdev; struct radeon_device *rdev;
unsigned long irq_flags, timeout; unsigned long irq_flags, timeout;
u32 seq; u32 seq;
int r; int i, r;
bool signaled;
if (fence == NULL) { if (fence == NULL) {
WARN(1, "Querying an invalid fence : %p !\n", fence); WARN(1, "Querying an invalid fence : %p !\n", fence);
return 0; return -EINVAL;
} }
rdev = fence->rdev; rdev = fence->rdev;
if (radeon_fence_signaled(fence)) { signaled = radeon_fence_signaled(fence);
return 0; while (!signaled) {
} read_lock_irqsave(&rdev->fence_lock, irq_flags);
timeout = rdev->fence_drv[fence->ring].last_timeout; timeout = jiffies - RADEON_FENCE_JIFFIES_TIMEOUT;
retry: if (time_after(rdev->fence_drv[fence->ring].last_activity, timeout)) {
/* save current sequence used to check for GPU lockup */ /* the normal case, timeout is somewhere before last_activity */
seq = rdev->fence_drv[fence->ring].last_seq; timeout = rdev->fence_drv[fence->ring].last_activity - timeout;
trace_radeon_fence_wait_begin(rdev->ddev, seq); } else {
if (intr) { /* either jiffies wrapped around, or no fence was signaled in the last 500ms
* anyway we will just wait for the minimum amount and then check for a lockup */
timeout = 1;
}
/* save current sequence value used to check for GPU lockups */
seq = rdev->fence_drv[fence->ring].last_seq;
read_unlock_irqrestore(&rdev->fence_lock, irq_flags);
trace_radeon_fence_wait_begin(rdev->ddev, seq);
radeon_irq_kms_sw_irq_get(rdev, fence->ring); radeon_irq_kms_sw_irq_get(rdev, fence->ring);
r = wait_event_interruptible_timeout(rdev->fence_drv[fence->ring].queue, if (intr) {
radeon_fence_signaled(fence), timeout); r = wait_event_interruptible_timeout(
rdev->fence_drv[fence->ring].queue,
(signaled = radeon_fence_signaled(fence)), timeout);
} else {
r = wait_event_timeout(
rdev->fence_drv[fence->ring].queue,
(signaled = radeon_fence_signaled(fence)), timeout);
}
radeon_irq_kms_sw_irq_put(rdev, fence->ring); radeon_irq_kms_sw_irq_put(rdev, fence->ring);
if (unlikely(r < 0)) { if (unlikely(r < 0)) {
return r; return r;
} }
} else { trace_radeon_fence_wait_end(rdev->ddev, seq);
radeon_irq_kms_sw_irq_get(rdev, fence->ring);
r = wait_event_timeout(rdev->fence_drv[fence->ring].queue, if (unlikely(!signaled)) {
radeon_fence_signaled(fence), timeout); /* we were interrupted for some reason and fence
radeon_irq_kms_sw_irq_put(rdev, fence->ring); * isn't signaled yet, resume waiting */
} if (r) {
trace_radeon_fence_wait_end(rdev->ddev, seq); continue;
if (unlikely(!radeon_fence_signaled(fence))) { }
/* we were interrupted for some reason and fence isn't
* isn't signaled yet, resume wait write_lock_irqsave(&rdev->fence_lock, irq_flags);
*/ /* check if sequence value has changed since last_activity */
if (r) { if (seq != rdev->fence_drv[fence->ring].last_seq) {
timeout = r; write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
goto retry; continue;
} }
/* don't protect read access to rdev->fence_drv[t].last_seq
* if we experiencing a lockup the value doesn't change /* change sequence value on all rings, so nobody else things there is a lockup */
*/ for (i = 0; i < RADEON_NUM_RINGS; ++i)
if (seq == rdev->fence_drv[fence->ring].last_seq && rdev->fence_drv[i].last_seq -= 0x10000;
radeon_ring_is_lockup(rdev, fence->ring, &rdev->ring[fence->ring])) { write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
/* good news we believe it's a lockup */ if (radeon_ring_is_lockup(rdev, fence->ring, &rdev->ring[fence->ring])) {
printk(KERN_WARNING "GPU lockup (waiting for 0x%08X last fence id 0x%08X)\n",
fence->seq, seq); /* good news we believe it's a lockup */
printk(KERN_WARNING "GPU lockup (waiting for 0x%08X last fence id 0x%08X)\n",
/* mark the ring as not ready any more */ fence->seq, seq);
rdev->ring[fence->ring].ready = false;
r = radeon_gpu_reset(rdev); /* mark the ring as not ready any more */
if (r) rdev->ring[fence->ring].ready = false;
return r; r = radeon_gpu_reset(rdev);
if (r)
return r;
write_lock_irqsave(&rdev->fence_lock, irq_flags);
rdev->fence_drv[fence->ring].last_activity = jiffies;
write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
}
} }
timeout = RADEON_FENCE_JIFFIES_TIMEOUT;
write_lock_irqsave(&rdev->fence_lock, irq_flags);
rdev->fence_drv[fence->ring].last_timeout = RADEON_FENCE_JIFFIES_TIMEOUT;
rdev->fence_drv[fence->ring].last_jiffies = jiffies;
write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
goto retry;
} }
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment