Commit 8fdf074f authored by Monk Liu's avatar Monk Liu Committed by Alex Deucher

drm/amdgpu:fix world switch hang

for SR-IOV, we must keep the pipeline-sync in the protection
of COND_EXEC, otherwise the command consumed by CPG is not
consistent when world switch triggerd, e.g.:

world switch hit and the IB frame is skipped so the fence
won't signal, thus CP will jump to the next DMAframe's pipeline-sync
command, and it will make CP hang foever.

after pipelin-sync moved into COND_EXEC the consistency can be
guaranteed
Signed-off-by: default avatarMonk Liu <Monk.Liu@amd.com>
Reviewed-by: default avatarChristian König <christian.koenig@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent bdb8cd10
...@@ -130,6 +130,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, ...@@ -130,6 +130,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
unsigned i; unsigned i;
int r = 0; int r = 0;
bool need_pipe_sync = false;
if (num_ibs == 0) if (num_ibs == 0)
return -EINVAL; return -EINVAL;
...@@ -165,7 +166,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, ...@@ -165,7 +166,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
if (ring->funcs->emit_pipeline_sync && job && if (ring->funcs->emit_pipeline_sync && job &&
((tmp = amdgpu_sync_get_fence(&job->sched_sync)) || ((tmp = amdgpu_sync_get_fence(&job->sched_sync)) ||
amdgpu_vm_need_pipeline_sync(ring, job))) { amdgpu_vm_need_pipeline_sync(ring, job))) {
amdgpu_ring_emit_pipeline_sync(ring); need_pipe_sync = true;
dma_fence_put(tmp); dma_fence_put(tmp);
} }
...@@ -173,7 +174,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, ...@@ -173,7 +174,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
ring->funcs->insert_start(ring); ring->funcs->insert_start(ring);
if (job) { if (job) {
r = amdgpu_vm_flush(ring, job); r = amdgpu_vm_flush(ring, job, need_pipe_sync);
if (r) { if (r) {
amdgpu_ring_undo(ring); amdgpu_ring_undo(ring);
return r; return r;
......
...@@ -743,7 +743,7 @@ static bool amdgpu_vm_is_large_bar(struct amdgpu_device *adev) ...@@ -743,7 +743,7 @@ static bool amdgpu_vm_is_large_bar(struct amdgpu_device *adev)
* *
* Emit a VM flush when it is necessary. * Emit a VM flush when it is necessary.
*/ */
int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job) int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_pipe_sync)
{ {
struct amdgpu_device *adev = ring->adev; struct amdgpu_device *adev = ring->adev;
unsigned vmhub = ring->funcs->vmhub; unsigned vmhub = ring->funcs->vmhub;
...@@ -765,12 +765,15 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job) ...@@ -765,12 +765,15 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job)
vm_flush_needed = true; vm_flush_needed = true;
} }
if (!vm_flush_needed && !gds_switch_needed) if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
return 0; return 0;
if (ring->funcs->init_cond_exec) if (ring->funcs->init_cond_exec)
patch_offset = amdgpu_ring_init_cond_exec(ring); patch_offset = amdgpu_ring_init_cond_exec(ring);
if (need_pipe_sync)
amdgpu_ring_emit_pipeline_sync(ring);
if (ring->funcs->emit_vm_flush && vm_flush_needed) { if (ring->funcs->emit_vm_flush && vm_flush_needed) {
struct dma_fence *fence; struct dma_fence *fence;
......
...@@ -222,7 +222,7 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev, ...@@ -222,7 +222,7 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring, int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
struct amdgpu_sync *sync, struct dma_fence *fence, struct amdgpu_sync *sync, struct dma_fence *fence,
struct amdgpu_job *job); struct amdgpu_job *job);
int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job); int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_pipe_sync);
void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub, void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub,
unsigned vmid); unsigned vmid);
void amdgpu_vm_reset_all_ids(struct amdgpu_device *adev); void amdgpu_vm_reset_all_ids(struct amdgpu_device *adev);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment