Commit 41cca166 authored by Marek Olšák's avatar Marek Olšák Committed by Alex Deucher

drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues

I'm not increasing the DRM version because GDS isn't totally without bugs yet.

v2: update emit_ib_size
Signed-off-by: default avatarMarek Olšák <marek.olsak@amd.com>
Acked-by: default avatarChristian König <christian.koenig@amd.com>
Acked-by: default avatarAlex Deucher <alexander.deucher@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 67dd1a36
...@@ -72,9 +72,10 @@ ...@@ -72,9 +72,10 @@
* - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE. * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE.
* - 3.27.0 - Add new chunk to to AMDGPU_CS to enable BO_LIST creation. * - 3.27.0 - Add new chunk to to AMDGPU_CS to enable BO_LIST creation.
* - 3.28.0 - Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES * - 3.28.0 - Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES
* - 3.29.0 - Add AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID
*/ */
#define KMS_DRIVER_MAJOR 3 #define KMS_DRIVER_MAJOR 3
#define KMS_DRIVER_MINOR 28 #define KMS_DRIVER_MINOR 29
#define KMS_DRIVER_PATCHLEVEL 0 #define KMS_DRIVER_PATCHLEVEL 0
int amdgpu_vram_limit = 0; int amdgpu_vram_limit = 0;
......
...@@ -37,6 +37,8 @@ struct amdgpu_gds { ...@@ -37,6 +37,8 @@ struct amdgpu_gds {
struct amdgpu_gds_asic_info mem; struct amdgpu_gds_asic_info mem;
struct amdgpu_gds_asic_info gws; struct amdgpu_gds_asic_info gws;
struct amdgpu_gds_asic_info oa; struct amdgpu_gds_asic_info oa;
uint32_t gds_compute_max_wave_id;
/* At present, GDS, GWS and OA resources for gfx (graphics) /* At present, GDS, GWS and OA resources for gfx (graphics)
* is always pre-allocated and available for graphics operation. * is always pre-allocated and available for graphics operation.
* Such resource is shared between all gfx clients. * Such resource is shared between all gfx clients.
......
...@@ -2264,6 +2264,22 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring, ...@@ -2264,6 +2264,22 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
unsigned vmid = AMDGPU_JOB_GET_VMID(job); unsigned vmid = AMDGPU_JOB_GET_VMID(job);
u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24); u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
/* Currently, there is a high possibility to get wave ID mismatch
* between ME and GDS, leading to a hw deadlock, because ME generates
* different wave IDs than the GDS expects. This situation happens
* randomly when at least 5 compute pipes use GDS ordered append.
* The wave IDs generated by ME are also wrong after suspend/resume.
* Those are probably bugs somewhere else in the kernel driver.
*
* Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
* GDS to 0 for this ring (me/pipe).
*/
if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
}
amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2)); amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
amdgpu_ring_write(ring, amdgpu_ring_write(ring,
#ifdef __BIG_ENDIAN #ifdef __BIG_ENDIAN
...@@ -5000,7 +5016,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = { ...@@ -5000,7 +5016,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
7 + /* gfx_v7_0_ring_emit_pipeline_sync */ 7 + /* gfx_v7_0_ring_emit_pipeline_sync */
CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v7_0_ring_emit_vm_flush */ CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v7_0_ring_emit_vm_flush */
7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user fence, vm fence */ 7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user fence, vm fence */
.emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_compute */ .emit_ib_size = 7, /* gfx_v7_0_ring_emit_ib_compute */
.emit_ib = gfx_v7_0_ring_emit_ib_compute, .emit_ib = gfx_v7_0_ring_emit_ib_compute,
.emit_fence = gfx_v7_0_ring_emit_fence_compute, .emit_fence = gfx_v7_0_ring_emit_fence_compute,
.emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync, .emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
...@@ -5057,6 +5073,7 @@ static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev) ...@@ -5057,6 +5073,7 @@ static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE); adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
adev->gds.gws.total_size = 64; adev->gds.gws.total_size = 64;
adev->gds.oa.total_size = 16; adev->gds.oa.total_size = 16;
adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
if (adev->gds.mem.total_size == 64 * 1024) { if (adev->gds.mem.total_size == 64 * 1024) {
adev->gds.mem.gfx_partition_size = 4096; adev->gds.mem.gfx_partition_size = 4096;
......
...@@ -6084,6 +6084,22 @@ static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring, ...@@ -6084,6 +6084,22 @@ static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
unsigned vmid = AMDGPU_JOB_GET_VMID(job); unsigned vmid = AMDGPU_JOB_GET_VMID(job);
u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24); u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
/* Currently, there is a high possibility to get wave ID mismatch
* between ME and GDS, leading to a hw deadlock, because ME generates
* different wave IDs than the GDS expects. This situation happens
* randomly when at least 5 compute pipes use GDS ordered append.
* The wave IDs generated by ME are also wrong after suspend/resume.
* Those are probably bugs somewhere else in the kernel driver.
*
* Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
* GDS to 0 for this ring (me/pipe).
*/
if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
}
amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2)); amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
amdgpu_ring_write(ring, amdgpu_ring_write(ring,
#ifdef __BIG_ENDIAN #ifdef __BIG_ENDIAN
...@@ -6890,7 +6906,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = { ...@@ -6890,7 +6906,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
7 + /* gfx_v8_0_ring_emit_pipeline_sync */ 7 + /* gfx_v8_0_ring_emit_pipeline_sync */
VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */ VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */
7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */ 7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
.emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */ .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
.emit_ib = gfx_v8_0_ring_emit_ib_compute, .emit_ib = gfx_v8_0_ring_emit_ib_compute,
.emit_fence = gfx_v8_0_ring_emit_fence_compute, .emit_fence = gfx_v8_0_ring_emit_fence_compute,
.emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync, .emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
...@@ -6920,7 +6936,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = { ...@@ -6920,7 +6936,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
7 + /* gfx_v8_0_ring_emit_pipeline_sync */ 7 + /* gfx_v8_0_ring_emit_pipeline_sync */
17 + /* gfx_v8_0_ring_emit_vm_flush */ 17 + /* gfx_v8_0_ring_emit_vm_flush */
7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for user fence, vm fence */ 7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for user fence, vm fence */
.emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */ .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
.emit_fence = gfx_v8_0_ring_emit_fence_kiq, .emit_fence = gfx_v8_0_ring_emit_fence_kiq,
.test_ring = gfx_v8_0_ring_test_ring, .test_ring = gfx_v8_0_ring_test_ring,
.insert_nop = amdgpu_ring_insert_nop, .insert_nop = amdgpu_ring_insert_nop,
...@@ -6996,6 +7012,7 @@ static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev) ...@@ -6996,6 +7012,7 @@ static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev)
adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE); adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
adev->gds.gws.total_size = 64; adev->gds.gws.total_size = 64;
adev->gds.oa.total_size = 16; adev->gds.oa.total_size = 16;
adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
if (adev->gds.mem.total_size == 64 * 1024) { if (adev->gds.mem.total_size == 64 * 1024) {
adev->gds.mem.gfx_partition_size = 4096; adev->gds.mem.gfx_partition_size = 4096;
......
...@@ -4010,6 +4010,22 @@ static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring, ...@@ -4010,6 +4010,22 @@ static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
unsigned vmid = AMDGPU_JOB_GET_VMID(job); unsigned vmid = AMDGPU_JOB_GET_VMID(job);
u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24); u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
/* Currently, there is a high possibility to get wave ID mismatch
* between ME and GDS, leading to a hw deadlock, because ME generates
* different wave IDs than the GDS expects. This situation happens
* randomly when at least 5 compute pipes use GDS ordered append.
* The wave IDs generated by ME are also wrong after suspend/resume.
* Those are probably bugs somewhere else in the kernel driver.
*
* Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
* GDS to 0 for this ring (me/pipe).
*/
if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID);
amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
}
amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2)); amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
BUG_ON(ib->gpu_addr & 0x3); /* Dword align */ BUG_ON(ib->gpu_addr & 0x3); /* Dword align */
amdgpu_ring_write(ring, amdgpu_ring_write(ring,
...@@ -4729,7 +4745,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = { ...@@ -4729,7 +4745,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
2 + /* gfx_v9_0_ring_emit_vm_flush */ 2 + /* gfx_v9_0_ring_emit_vm_flush */
8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */ 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
.emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */ .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
.emit_ib = gfx_v9_0_ring_emit_ib_compute, .emit_ib = gfx_v9_0_ring_emit_ib_compute,
.emit_fence = gfx_v9_0_ring_emit_fence, .emit_fence = gfx_v9_0_ring_emit_fence,
.emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync, .emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
...@@ -4764,7 +4780,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = { ...@@ -4764,7 +4780,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
2 + /* gfx_v9_0_ring_emit_vm_flush */ 2 + /* gfx_v9_0_ring_emit_vm_flush */
8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */ 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */
.emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */ .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
.emit_fence = gfx_v9_0_ring_emit_fence_kiq, .emit_fence = gfx_v9_0_ring_emit_fence_kiq,
.test_ring = gfx_v9_0_ring_test_ring, .test_ring = gfx_v9_0_ring_test_ring,
.insert_nop = amdgpu_ring_insert_nop, .insert_nop = amdgpu_ring_insert_nop,
...@@ -4846,6 +4862,26 @@ static void gfx_v9_0_set_gds_init(struct amdgpu_device *adev) ...@@ -4846,6 +4862,26 @@ static void gfx_v9_0_set_gds_init(struct amdgpu_device *adev)
break; break;
} }
switch (adev->asic_type) {
case CHIP_VEGA10:
case CHIP_VEGA20:
adev->gds.gds_compute_max_wave_id = 0x7ff;
break;
case CHIP_VEGA12:
adev->gds.gds_compute_max_wave_id = 0x27f;
break;
case CHIP_RAVEN:
if (adev->rev_id >= 0x8)
adev->gds.gds_compute_max_wave_id = 0x77; /* raven2 */
else
adev->gds.gds_compute_max_wave_id = 0x15f; /* raven1 */
break;
default:
/* this really depends on the chip */
adev->gds.gds_compute_max_wave_id = 0x7ff;
break;
}
adev->gds.gws.total_size = 64; adev->gds.gws.total_size = 64;
adev->gds.oa.total_size = 16; adev->gds.oa.total_size = 16;
......
...@@ -566,6 +566,11 @@ union drm_amdgpu_cs { ...@@ -566,6 +566,11 @@ union drm_amdgpu_cs {
* caches (L2/vL1/sL1/I$). */ * caches (L2/vL1/sL1/I$). */
#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3) #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER.
* This will reset wave ID counters for the IB.
*/
#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4)
struct drm_amdgpu_cs_chunk_ib { struct drm_amdgpu_cs_chunk_ib {
__u32 _pad; __u32 _pad;
/** AMDGPU_IB_FLAG_* */ /** AMDGPU_IB_FLAG_* */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment