Commit b80cd524 authored by Felix Kuehling, committed by Alex Deucher

drm/amdgpu: Improve Vega20 XGMI TLB flush workaround

Using a heavy-weight TLB flush once is not sufficient. Concurrent
memory accesses in the same TLB cache line can re-populate TLB entries
from stale texture cache (TC) entries while the heavy-weight TLB
flush is in progress. To fix this race condition, perform another TLB
flush after the heavy-weight one, when TC is known to be clean.

Move the workaround into the low-level TLB flushing functions. This way
it applies to amdgpu as well, and the KIQ-based TLB flush only needs to
synchronize once.
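
For illustration only (not part of the commit): a minimal standalone C sketch of the
double-flush sequence described above. The helpers issue_invalidate() and wait_for_ack()
are hypothetical stand-ins for the register-level request/ack handshake that
gmc_v9_0_flush_gpu_tlb() performs; the real code below expresses the same idea as a
do/while loop over the encoded invalidation request.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical stubs standing in for writing the invalidation request
 * register and polling the ack register. */
static void issue_invalidate(uint32_t vmid, uint32_t flush_type)
{
	(void)vmid; (void)flush_type;	/* stubbed out for the sketch */
}

static void wait_for_ack(uint32_t vmid)
{
	(void)vmid;			/* stubbed out for the sketch */
}

static void flush_gpu_tlb_sketch(uint32_t vmid, uint32_t flush_type,
				 bool vega20_xgmi)
{
	/* On Vega20+XGMI, start with a heavy-weight (type 2) flush that
	 * invalidates both TC and TLB; otherwise use the caller's type. */
	issue_invalidate(vmid, vega20_xgmi ? 2 : flush_type);
	wait_for_ack(vmid);

	if (vega20_xgmi) {
		/* TC is clean now; a second flush of the requested type
		 * removes any TLB entries re-populated from stale TC data
		 * while the first flush was still in progress. */
		issue_invalidate(vmid, flush_type);
		wait_for_ack(vmid);
	}
}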
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Reviewed-by: shaoyun liu <shaoyun.liu@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 82c4ebfa
@@ -647,13 +647,9 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
 int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-	uint32_t flush_type = 0;
+	const uint32_t flush_type = 0;
 	bool all_hub = false;
 
-	if (adev->gmc.xgmi.num_physical_nodes &&
-	    adev->asic_type == CHIP_VEGA20)
-		flush_type = 2;
-
 	if (adev->family == AMDGPU_FAMILY_AI)
 		all_hub = true;
 
@@ -476,13 +476,26 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 {
 	bool use_semaphore = gmc_v9_0_use_invalidate_semaphore(adev, vmhub);
 	const unsigned eng = 17;
-	u32 j, inv_req, tmp;
+	u32 j, inv_req, inv_req2, tmp;
 	struct amdgpu_vmhub *hub;
 
 	BUG_ON(vmhub >= adev->num_vmhubs);
 
 	hub = &adev->vmhub[vmhub];
-	inv_req = gmc_v9_0_get_invalidate_req(vmid, flush_type);
+	if (adev->gmc.xgmi.num_physical_nodes &&
+	    adev->asic_type == CHIP_VEGA20) {
+		/* Vega20+XGMI caches PTEs in TC and TLB. Add a
+		 * heavy-weight TLB flush (type 2), which flushes
+		 * both. Due to a race condition with concurrent
+		 * memory accesses using the same TLB cache line, we
+		 * still need a second TLB flush after this.
+		 */
+		inv_req = gmc_v9_0_get_invalidate_req(vmid, 2);
+		inv_req2 = gmc_v9_0_get_invalidate_req(vmid, flush_type);
+	} else {
+		inv_req = gmc_v9_0_get_invalidate_req(vmid, flush_type);
+		inv_req2 = 0;
+	}
 
 	/* This is necessary for a HW workaround under SRIOV as well
 	 * as GFXOFF under bare metal
@@ -521,21 +534,27 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 			DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
 	}
 
-	WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, inv_req);
+	do {
+		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, inv_req);
 
-	/*
-	 * Issue a dummy read to wait for the ACK register to be cleared
-	 * to avoid a false ACK due to the new fast GRBM interface.
-	 */
-	if (vmhub == AMDGPU_GFXHUB_0)
-		RREG32_NO_KIQ(hub->vm_inv_eng0_req + eng);
+		/*
+		 * Issue a dummy read to wait for the ACK register to
+		 * be cleared to avoid a false ACK due to the new fast
+		 * GRBM interface.
+		 */
+		if (vmhub == AMDGPU_GFXHUB_0)
+			RREG32_NO_KIQ(hub->vm_inv_eng0_req + eng);
 
-	for (j = 0; j < adev->usec_timeout; j++) {
-		tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_ack + eng);
-		if (tmp & (1 << vmid))
-			break;
-		udelay(1);
-	}
+		for (j = 0; j < adev->usec_timeout; j++) {
+			tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_ack + eng);
+			if (tmp & (1 << vmid))
+				break;
+			udelay(1);
+		}
+
+		inv_req = inv_req2;
+		inv_req2 = 0;
+	} while (inv_req);
 
 	/* TODO: It needs to continue working on debugging with semaphore for GFXHUB as well. */
 	if (use_semaphore)
@@ -577,9 +596,26 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 			return -EIO;
 
 	if (ring->sched.ready) {
+		/* Vega20+XGMI caches PTEs in TC and TLB. Add a
+		 * heavy-weight TLB flush (type 2), which flushes
+		 * both. Due to a race condition with concurrent
+		 * memory accesses using the same TLB cache line, we
+		 * still need a second TLB flush after this.
+		 */
+		bool vega20_xgmi_wa = (adev->gmc.xgmi.num_physical_nodes &&
+				       adev->asic_type == CHIP_VEGA20);
+		/* 2 dwords flush + 8 dwords fence */
+		unsigned int ndw = kiq->pmf->invalidate_tlbs_size + 8;
+
+		if (vega20_xgmi_wa)
+			ndw += kiq->pmf->invalidate_tlbs_size;
+
 		spin_lock(&adev->gfx.kiq.ring_lock);
 		/* 2 dwords flush + 8 dwords fence */
-		amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
+		amdgpu_ring_alloc(ring, ndw);
+		if (vega20_xgmi_wa)
+			kiq->pmf->kiq_invalidate_tlbs(ring,
+						      pasid, 2, all_hub);
 		kiq->pmf->kiq_invalidate_tlbs(ring,
 					pasid, flush_type, all_hub);
 		amdgpu_fence_emit_polling(ring, &seq);