Commit 45bbf800 authored by Laurent Morichetti's avatar Laurent Morichetti Committed by Alex Deucher

drm/amdkfd: Use SQC when TCP would fail in gfx10.1 context save

Similarly to gfx9, gfx10.1 drops vector stores when an xnack error is
raised. To work around this issue, use scalar stores instead of vector
stores when trapsts.xnack_error == 1.
Signed-off-by: default avatarLaurent Morichetti <laurent.morichetti@amd.com>
Reviewed-by: default avatarJay Cornwall <jay.cornwall@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent f36e3f72
...@@ -44,6 +44,7 @@ ...@@ -44,6 +44,7 @@
#define HAVE_SENDMSG_RTN (ASIC_FAMILY >= CHIP_PLUM_BONITO) #define HAVE_SENDMSG_RTN (ASIC_FAMILY >= CHIP_PLUM_BONITO)
#define HAVE_BUFFER_LDS_LOAD (ASIC_FAMILY < CHIP_PLUM_BONITO) #define HAVE_BUFFER_LDS_LOAD (ASIC_FAMILY < CHIP_PLUM_BONITO)
#define SW_SA_TRAP (ASIC_FAMILY >= CHIP_PLUM_BONITO) #define SW_SA_TRAP (ASIC_FAMILY >= CHIP_PLUM_BONITO)
#define SAVE_AFTER_XNACK_ERROR (HAVE_XNACK && !NO_SQC_STORE) // workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger
var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised
...@@ -86,6 +87,7 @@ var SQ_WAVE_TRAPSTS_WAVE_START_MASK = 0x20000 ...@@ -86,6 +87,7 @@ var SQ_WAVE_TRAPSTS_WAVE_START_MASK = 0x20000
var SQ_WAVE_TRAPSTS_WAVE_END_MASK = 0x40000 var SQ_WAVE_TRAPSTS_WAVE_END_MASK = 0x40000
var SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK = 0x100000 var SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK = 0x100000
#endif #endif
var SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK = 0x10000000
var SQ_WAVE_MODE_EXCP_EN_SHIFT = 12 var SQ_WAVE_MODE_EXCP_EN_SHIFT = 12
var SQ_WAVE_MODE_EXCP_EN_ADDR_WATCH_SHIFT = 19 var SQ_WAVE_MODE_EXCP_EN_ADDR_WATCH_SHIFT = 19
...@@ -475,6 +477,16 @@ L_SAVE_4VGPR_WAVE32: ...@@ -475,6 +477,16 @@ L_SAVE_4VGPR_WAVE32:
// VGPR Allocated in 4-GPR granularity // VGPR Allocated in 4-GPR granularity
#if SAVE_AFTER_XNACK_ERROR
check_if_tcp_store_ok()
s_cbranch_scc1 L_SAVE_FIRST_VGPRS32_WITH_TCP
write_vgprs_to_mem_with_sqc_w32(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
s_branch L_SAVE_HWREG
L_SAVE_FIRST_VGPRS32_WITH_TCP:
#endif
#if !NO_SQC_STORE #if !NO_SQC_STORE
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
#endif #endif
...@@ -488,6 +500,16 @@ L_SAVE_4VGPR_WAVE64: ...@@ -488,6 +500,16 @@ L_SAVE_4VGPR_WAVE64:
// VGPR Allocated in 4-GPR granularity // VGPR Allocated in 4-GPR granularity
#if SAVE_AFTER_XNACK_ERROR
check_if_tcp_store_ok()
s_cbranch_scc1 L_SAVE_FIRST_VGPRS64_WITH_TCP
write_vgprs_to_mem_with_sqc_w64(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
s_branch L_SAVE_HWREG
L_SAVE_FIRST_VGPRS64_WITH_TCP:
#endif
#if !NO_SQC_STORE #if !NO_SQC_STORE
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
#endif #endif
...@@ -660,6 +682,26 @@ L_SAVE_LDS_NORMAL: ...@@ -660,6 +682,26 @@ L_SAVE_LDS_NORMAL:
s_cbranch_scc1 L_SAVE_LDS_W64 s_cbranch_scc1 L_SAVE_LDS_W64
L_SAVE_LDS_W32: L_SAVE_LDS_W32:
#if SAVE_AFTER_XNACK_ERROR
check_if_tcp_store_ok()
s_cbranch_scc1 L_SAVE_LDS_WITH_TCP_W32
L_SAVE_LDS_LOOP_SQC_W32:
ds_read_b32 v1, v0
s_waitcnt 0
write_vgprs_to_mem_with_sqc_w32(v1, 1, s_save_buf_rsrc0, s_save_mem_offset)
s_add_u32 m0, m0, 128 //every buffer_store_lds does 128 bytes
v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes
s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_LDS_LOOP_SQC_W32 //LDS save is complete?
s_branch L_SAVE_LDS_DONE
L_SAVE_LDS_WITH_TCP_W32:
#endif
s_mov_b32 s3, 128 s_mov_b32 s3, 128
s_nop 0 s_nop 0
s_nop 0 s_nop 0
...@@ -669,7 +711,7 @@ L_SAVE_LDS_LOOP_W32: ...@@ -669,7 +711,7 @@ L_SAVE_LDS_LOOP_W32:
s_waitcnt 0 s_waitcnt 0
buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes s_add_u32 m0, m0, s3 //every buffer_store_lds does 128 bytes
s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes
s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
...@@ -678,6 +720,26 @@ L_SAVE_LDS_LOOP_W32: ...@@ -678,6 +720,26 @@ L_SAVE_LDS_LOOP_W32:
s_branch L_SAVE_LDS_DONE s_branch L_SAVE_LDS_DONE
L_SAVE_LDS_W64: L_SAVE_LDS_W64:
#if SAVE_AFTER_XNACK_ERROR
check_if_tcp_store_ok()
s_cbranch_scc1 L_SAVE_LDS_WITH_TCP_W64
L_SAVE_LDS_LOOP_SQC_W64:
ds_read_b32 v1, v0
s_waitcnt 0
write_vgprs_to_mem_with_sqc_w64(v1, 1, s_save_buf_rsrc0, s_save_mem_offset)
s_add_u32 m0, m0, 256 //every buffer_store_lds does 256 bytes
v_add_nc_u32 v0, v0, 256 //mem offset increased by 256 bytes
s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_LDS_LOOP_SQC_W64 //LDS save is complete?
s_branch L_SAVE_LDS_DONE
L_SAVE_LDS_WITH_TCP_W64:
#endif
s_mov_b32 s3, 256 s_mov_b32 s3, 256
s_nop 0 s_nop 0
s_nop 0 s_nop 0
...@@ -727,6 +789,25 @@ L_SAVE_VGPR_NORMAL: ...@@ -727,6 +789,25 @@ L_SAVE_VGPR_NORMAL:
s_cmp_lt_u32 m0, s_save_alloc_size s_cmp_lt_u32 m0, s_save_alloc_size
s_cbranch_scc0 L_SAVE_VGPR_END s_cbranch_scc0 L_SAVE_VGPR_END
#if SAVE_AFTER_XNACK_ERROR
check_if_tcp_store_ok()
s_cbranch_scc1 L_SAVE_VGPR_W32_LOOP
L_SAVE_VGPR_LOOP_SQC_W32:
v_movrels_b32 v0, v0 //v0 = v[0+m0]
v_movrels_b32 v1, v1 //v1 = v[1+m0]
v_movrels_b32 v2, v2 //v2 = v[2+m0]
v_movrels_b32 v3, v3 //v3 = v[3+m0]
write_vgprs_to_mem_with_sqc_w32(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
s_add_u32 m0, m0, 4
s_cmp_lt_u32 m0, s_save_alloc_size
s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC_W32
s_branch L_SAVE_VGPR_END
#endif
L_SAVE_VGPR_W32_LOOP: L_SAVE_VGPR_W32_LOOP:
v_movrels_b32 v0, v0 //v0 = v[0+m0] v_movrels_b32 v0, v0 //v0 = v[0+m0]
v_movrels_b32 v1, v1 //v1 = v[1+m0] v_movrels_b32 v1, v1 //v1 = v[1+m0]
...@@ -753,6 +834,25 @@ L_SAVE_VGPR_WAVE64: ...@@ -753,6 +834,25 @@ L_SAVE_VGPR_WAVE64:
s_cmp_lt_u32 m0, s_save_alloc_size s_cmp_lt_u32 m0, s_save_alloc_size
s_cbranch_scc0 L_SAVE_SHARED_VGPR s_cbranch_scc0 L_SAVE_SHARED_VGPR
#if SAVE_AFTER_XNACK_ERROR
check_if_tcp_store_ok()
s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP
L_SAVE_VGPR_LOOP_SQC_W64:
v_movrels_b32 v0, v0 //v0 = v[0+m0]
v_movrels_b32 v1, v1 //v1 = v[1+m0]
v_movrels_b32 v2, v2 //v2 = v[2+m0]
v_movrels_b32 v3, v3 //v3 = v[3+m0]
write_vgprs_to_mem_with_sqc_w64(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
s_add_u32 m0, m0, 4
s_cmp_lt_u32 m0, s_save_alloc_size
s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC_W64
s_branch L_SAVE_VGPR_END
#endif
L_SAVE_VGPR_W64_LOOP: L_SAVE_VGPR_W64_LOOP:
v_movrels_b32 v0, v0 //v0 = v[0+m0] v_movrels_b32 v0, v0 //v0 = v[0+m0]
v_movrels_b32 v1, v1 //v1 = v[1+m0] v_movrels_b32 v1, v1 //v1 = v[1+m0]
...@@ -780,6 +880,23 @@ L_SAVE_SHARED_VGPR: ...@@ -780,6 +880,23 @@ L_SAVE_SHARED_VGPR:
s_add_u32 s_save_alloc_size, s_save_alloc_size, m0 s_add_u32 s_save_alloc_size, s_save_alloc_size, m0
s_mov_b32 exec_lo, 0xFFFFFFFF s_mov_b32 exec_lo, 0xFFFFFFFF
s_mov_b32 exec_hi, 0x00000000 s_mov_b32 exec_hi, 0x00000000
#if SAVE_AFTER_XNACK_ERROR
check_if_tcp_store_ok()
s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP
L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC:
v_movrels_b32 v0, v0
write_vgprs_to_mem_with_sqc_w64(v0, 1, s_save_buf_rsrc0, s_save_mem_offset)
s_add_u32 m0, m0, 1
s_cmp_lt_u32 m0, s_save_alloc_size
s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC
s_branch L_SAVE_VGPR_END
#endif
L_SAVE_SHARED_VGPR_WAVE64_LOOP: L_SAVE_SHARED_VGPR_WAVE64_LOOP:
v_movrels_b32 v0, v0 //v0 = v[0+m0] v_movrels_b32 v0, v0 //v0 = v[0+m0]
buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
...@@ -1190,6 +1307,43 @@ function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset) ...@@ -1190,6 +1307,43 @@ function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)
s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset glc:1 s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset glc:1
end end
#if SAVE_AFTER_XNACK_ERROR
function check_if_tcp_store_ok
// If TRAPSTS.XNACK_ERROR=1 then TCP stores will fail.
s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS)
s_andn2_b32 s_save_tmp, SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK, s_save_tmp
L_TCP_STORE_CHECK_DONE:
end
function write_vgpr_to_mem_with_sqc(vgpr, n_lanes, s_rsrc, s_mem_offset)
s_mov_b32 s4, 0
L_WRITE_VGPR_LANE_LOOP:
for var lane = 0; lane < 4; ++lane
v_readlane_b32 s[lane], vgpr, s4
s_add_u32 s4, s4, 1
end
s_buffer_store_dwordx4 s[0:3], s_rsrc, s_mem_offset glc:1
s_add_u32 s_mem_offset, s_mem_offset, 0x10
s_cmp_eq_u32 s4, n_lanes
s_cbranch_scc0 L_WRITE_VGPR_LANE_LOOP
end
function write_vgprs_to_mem_with_sqc_w32(vgpr0, n_vgprs, s_rsrc, s_mem_offset)
for var vgpr = 0; vgpr < n_vgprs; ++vgpr
write_vgpr_to_mem_with_sqc(vgpr0[vgpr], 32, s_rsrc, s_mem_offset)
end
end
function write_vgprs_to_mem_with_sqc_w64(vgpr0, n_vgprs, s_rsrc, s_mem_offset)
for var vgpr = 0; vgpr < n_vgprs; ++vgpr
write_vgpr_to_mem_with_sqc(vgpr0[vgpr], 64, s_rsrc, s_mem_offset)
end
end
#endif
function get_lds_size_bytes(s_lds_size_byte) function get_lds_size_bytes(s_lds_size_byte)
s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment