Commit 0bfa4b41 authored by Christian König's avatar Christian König Committed by Alex Deucher

drm/radeon: handle lockup in delayed work, v5

v5 (chk): complete rework, start when the first fence is emitted,
          stop when the last fence is signalled, make it work
          correctly with GPU resets, cleanup radeon_fence_wait_seq
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 9bb39ff4
...@@ -350,6 +350,7 @@ extern void evergreen_tiling_fields(unsigned tiling_flags, unsigned *bankw, ...@@ -350,6 +350,7 @@ extern void evergreen_tiling_fields(unsigned tiling_flags, unsigned *bankw,
* Fences. * Fences.
*/ */
struct radeon_fence_driver { struct radeon_fence_driver {
struct radeon_device *rdev;
uint32_t scratch_reg; uint32_t scratch_reg;
uint64_t gpu_addr; uint64_t gpu_addr;
volatile uint32_t *cpu_addr; volatile uint32_t *cpu_addr;
...@@ -357,6 +358,7 @@ struct radeon_fence_driver { ...@@ -357,6 +358,7 @@ struct radeon_fence_driver {
uint64_t sync_seq[RADEON_NUM_RINGS]; uint64_t sync_seq[RADEON_NUM_RINGS];
atomic64_t last_seq; atomic64_t last_seq;
bool initialized; bool initialized;
struct delayed_work lockup_work;
}; };
struct radeon_fence { struct radeon_fence {
......
...@@ -97,6 +97,25 @@ static u32 radeon_fence_read(struct radeon_device *rdev, int ring) ...@@ -97,6 +97,25 @@ static u32 radeon_fence_read(struct radeon_device *rdev, int ring)
return seq; return seq;
} }
/**
 * radeon_fence_schedule_check - schedule lockup check
 *
 * @rdev: radeon_device pointer
 * @ring: ring index we should work with
 *
 * Queues a delayed work item to check for lockups.
 */
static void radeon_fence_schedule_check(struct radeon_device *rdev, int ring)
{
	/*
	 * Do not reset the timer here with mod_delayed_work,
	 * this can livelock in an interaction with TTM delayed destroy.
	 *
	 * queue_delayed_work() is a no-op while the item is still pending,
	 * so repeated calls never push an already-armed deadline back.
	 */
	queue_delayed_work(system_power_efficient_wq,
			   &rdev->fence_drv[ring].lockup_work,
			   RADEON_FENCE_JIFFIES_TIMEOUT);
}
/** /**
* radeon_fence_emit - emit a fence on the requested ring * radeon_fence_emit - emit a fence on the requested ring
* *
...@@ -122,19 +141,21 @@ int radeon_fence_emit(struct radeon_device *rdev, ...@@ -122,19 +141,21 @@ int radeon_fence_emit(struct radeon_device *rdev,
(*fence)->ring = ring; (*fence)->ring = ring;
radeon_fence_ring_emit(rdev, ring, *fence); radeon_fence_ring_emit(rdev, ring, *fence);
trace_radeon_fence_emit(rdev->ddev, ring, (*fence)->seq); trace_radeon_fence_emit(rdev->ddev, ring, (*fence)->seq);
radeon_fence_schedule_check(rdev, ring);
return 0; return 0;
} }
/** /**
* radeon_fence_process - process a fence * radeon_fence_activity - check for fence activity
* *
* @rdev: radeon_device pointer * @rdev: radeon_device pointer
* @ring: ring index the fence is associated with * @ring: ring index the fence is associated with
* *
* Checks the current fence value and wakes the fence queue * Checks the current fence value and calculates the last
* if the sequence number has increased (all asics). * signalled fence value. Returns true if activity occured
* on the ring, and the fence_queue should be waken up.
*/ */
void radeon_fence_process(struct radeon_device *rdev, int ring) static bool radeon_fence_activity(struct radeon_device *rdev, int ring)
{ {
uint64_t seq, last_seq, last_emitted; uint64_t seq, last_seq, last_emitted;
unsigned count_loop = 0; unsigned count_loop = 0;
...@@ -190,7 +211,67 @@ void radeon_fence_process(struct radeon_device *rdev, int ring) ...@@ -190,7 +211,67 @@ void radeon_fence_process(struct radeon_device *rdev, int ring)
} }
} while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq); } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);
if (wake) if (seq < last_emitted)
radeon_fence_schedule_check(rdev, ring);
return wake;
}
/**
 * radeon_fence_check_lockup - check for hardware lockup
 *
 * @work: delayed work item
 *
 * Checks for fence activity; if there was none, probe the
 * hardware to see whether a lockup occurred.
 */
static void radeon_fence_check_lockup(struct work_struct *work)
{
	struct radeon_fence_driver *drv =
		container_of(work, struct radeon_fence_driver, lockup_work.work);
	struct radeon_device *rdev = drv->rdev;
	int ring = drv - rdev->fence_drv;

	if (!down_read_trylock(&rdev->exclusive_lock)) {
		/* a GPU reset is in flight; just try again later */
		radeon_fence_schedule_check(rdev, ring);
		return;
	}

	if (radeon_fence_activity(rdev, ring)) {
		wake_up_all(&rdev->fence_queue);
	} else if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) {
		/* good news we believe it's a lockup */
		dev_warn(rdev->dev, "GPU lockup (current fence id "
			 "0x%016llx last fence id 0x%016llx on ring %d)\n",
			 (uint64_t)atomic64_read(&drv->last_seq),
			 drv->sync_seq[ring], ring);

		/* remember that a reset is needed */
		rdev->needs_reset = true;
		wake_up_all(&rdev->fence_queue);
	}

	up_read(&rdev->exclusive_lock);
}
/**
* radeon_fence_process - process a fence
*
* @rdev: radeon_device pointer
* @ring: ring index the fence is associated with
*
* Checks the current fence value and wakes the fence queue
* if the sequence number has increased (all asics).
*/
void radeon_fence_process(struct radeon_device *rdev, int ring)
{
if (radeon_fence_activity(rdev, ring))
wake_up_all(&rdev->fence_queue); wake_up_all(&rdev->fence_queue);
} }
...@@ -300,86 +381,43 @@ static bool radeon_fence_any_seq_signaled(struct radeon_device *rdev, u64 *seq) ...@@ -300,86 +381,43 @@ static bool radeon_fence_any_seq_signaled(struct radeon_device *rdev, u64 *seq)
static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq, static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
bool intr) bool intr)
{ {
uint64_t last_seq[RADEON_NUM_RINGS]; long r;
bool signaled; int i;
int i, r;
while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
/* Save current sequence values, used to check for GPU lockups */ if (radeon_fence_any_seq_signaled(rdev, target_seq))
for (i = 0; i < RADEON_NUM_RINGS; ++i) { return 0;
if (!target_seq[i])
continue;
last_seq[i] = atomic64_read(&rdev->fence_drv[i].last_seq); /* enable IRQs and tracing */
trace_radeon_fence_wait_begin(rdev->ddev, i, target_seq[i]); for (i = 0; i < RADEON_NUM_RINGS; ++i) {
radeon_irq_kms_sw_irq_get(rdev, i); if (!target_seq[i])
} continue;
if (intr) { trace_radeon_fence_wait_begin(rdev->ddev, i, target_seq[i]);
r = wait_event_interruptible_timeout(rdev->fence_queue, ( radeon_irq_kms_sw_irq_get(rdev, i);
(signaled = radeon_fence_any_seq_signaled(rdev, target_seq)) }
|| rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
} else {
r = wait_event_timeout(rdev->fence_queue, (
(signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
|| rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
}
for (i = 0; i < RADEON_NUM_RINGS; ++i) { if (intr) {
if (!target_seq[i]) r = wait_event_interruptible_timeout(rdev->fence_queue, (
continue; radeon_fence_any_seq_signaled(rdev, target_seq)
|| rdev->needs_reset), MAX_SCHEDULE_TIMEOUT);
} else {
r = wait_event_timeout(rdev->fence_queue, (
radeon_fence_any_seq_signaled(rdev, target_seq)
|| rdev->needs_reset), MAX_SCHEDULE_TIMEOUT);
}
radeon_irq_kms_sw_irq_put(rdev, i); if (rdev->needs_reset)
trace_radeon_fence_wait_end(rdev->ddev, i, target_seq[i]); r = -EDEADLK;
}
if (unlikely(r < 0)) for (i = 0; i < RADEON_NUM_RINGS; ++i) {
return r; if (!target_seq[i])
continue;
if (unlikely(!signaled)) { radeon_irq_kms_sw_irq_put(rdev, i);
if (rdev->needs_reset) trace_radeon_fence_wait_end(rdev->ddev, i, target_seq[i]);
return -EDEADLK;
/* we were interrupted for some reason and fence
* isn't signaled yet, resume waiting */
if (r)
continue;
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
if (!target_seq[i])
continue;
if (last_seq[i] != atomic64_read(&rdev->fence_drv[i].last_seq))
break;
}
if (i != RADEON_NUM_RINGS)
continue;
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
if (!target_seq[i])
continue;
if (radeon_ring_is_lockup(rdev, i, &rdev->ring[i]))
break;
}
if (i < RADEON_NUM_RINGS) {
/* good news we believe it's a lockup */
dev_warn(rdev->dev, "GPU lockup (waiting for "
"0x%016llx last fence id 0x%016llx on"
" ring %d)\n",
target_seq[i], last_seq[i], i);
/* remember that we need an reset */
rdev->needs_reset = true;
wake_up_all(&rdev->fence_queue);
return -EDEADLK;
}
}
} }
return 0;
return r < 0 ? r : 0;
} }
/** /**
...@@ -711,6 +749,9 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring) ...@@ -711,6 +749,9 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring)
rdev->fence_drv[ring].sync_seq[i] = 0; rdev->fence_drv[ring].sync_seq[i] = 0;
atomic64_set(&rdev->fence_drv[ring].last_seq, 0); atomic64_set(&rdev->fence_drv[ring].last_seq, 0);
rdev->fence_drv[ring].initialized = false; rdev->fence_drv[ring].initialized = false;
INIT_DELAYED_WORK(&rdev->fence_drv[ring].lockup_work,
radeon_fence_check_lockup);
rdev->fence_drv[ring].rdev = rdev;
} }
/** /**
...@@ -760,6 +801,7 @@ void radeon_fence_driver_fini(struct radeon_device *rdev) ...@@ -760,6 +801,7 @@ void radeon_fence_driver_fini(struct radeon_device *rdev)
/* no need to trigger GPU reset as we are unloading */ /* no need to trigger GPU reset as we are unloading */
radeon_fence_driver_force_completion(rdev, ring); radeon_fence_driver_force_completion(rdev, ring);
} }
cancel_delayed_work_sync(&rdev->fence_drv[ring].lockup_work);
wake_up_all(&rdev->fence_queue); wake_up_all(&rdev->fence_queue);
radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg); radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg);
rdev->fence_drv[ring].initialized = false; rdev->fence_drv[ring].initialized = false;
...@@ -778,8 +820,10 @@ void radeon_fence_driver_fini(struct radeon_device *rdev) ...@@ -778,8 +820,10 @@ void radeon_fence_driver_fini(struct radeon_device *rdev)
*/ */
void radeon_fence_driver_force_completion(struct radeon_device *rdev, int ring) void radeon_fence_driver_force_completion(struct radeon_device *rdev, int ring)
{ {
if (rdev->fence_drv[ring].initialized) if (rdev->fence_drv[ring].initialized) {
radeon_fence_write(rdev, rdev->fence_drv[ring].sync_seq[ring], ring); radeon_fence_write(rdev, rdev->fence_drv[ring].sync_seq[ring], ring);
cancel_delayed_work_sync(&rdev->fence_drv[ring].lockup_work);
}
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.