Commit 2fc46e0b authored by Tao Zhou, committed by Alex Deucher

drm/amdgpu: make reset method configurable for RAS poison

Each RAS block has different requirement for gpu reset in poison
consumption handling.
Add support for mmhub RAS poison consumption handling.

v2: remove the mmhub poison support for kfd int v10.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent e3d4de8d
...@@ -748,7 +748,7 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev) ...@@ -748,7 +748,7 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
} }
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, bool reset) enum amdgpu_ras_block block, uint32_t reset)
{ {
amdgpu_umc_poison_handler(adev, block, reset); amdgpu_umc_poison_handler(adev, block, reset);
} }
......
...@@ -336,7 +336,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev); ...@@ -336,7 +336,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev, int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config); struct tile_config *config);
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, bool reset); enum amdgpu_ras_block block, uint32_t reset);
bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev); bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
void amdgpu_amdkfd_block_mmu_notifications(void *p); void amdgpu_amdkfd_block_mmu_notifications(void *p);
......
...@@ -2051,7 +2051,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * ...@@ -2051,7 +2051,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
} }
} }
amdgpu_umc_poison_handler(adev, obj->head.block, false); amdgpu_umc_poison_handler(adev, obj->head.block, 0);
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption) if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
...@@ -2704,7 +2704,7 @@ static int amdgpu_ras_page_retirement_thread(void *param) ...@@ -2704,7 +2704,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_dec(&con->page_retirement_req_cnt); atomic_dec(&con->page_retirement_req_cnt);
amdgpu_umc_bad_page_polling_timeout(adev, amdgpu_umc_bad_page_polling_timeout(adev,
false, MAX_UMC_POISON_POLLING_TIME_ASYNC); 0, MAX_UMC_POISON_POLLING_TIME_ASYNC);
} }
return 0; return 0;
......
...@@ -177,7 +177,7 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, ...@@ -177,7 +177,7 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
void *ras_error_status, void *ras_error_status,
struct amdgpu_iv_entry *entry, struct amdgpu_iv_entry *entry,
bool reset) uint32_t reset)
{ {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
...@@ -186,9 +186,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, ...@@ -186,9 +186,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
amdgpu_umc_handle_bad_pages(adev, ras_error_status); amdgpu_umc_handle_bad_pages(adev, ras_error_status);
if (err_data->ue_count && reset) { if (err_data->ue_count && reset) {
/* use mode-2 reset for poison consumption */ con->gpu_reset_flags |= reset;
if (!entry)
con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
amdgpu_ras_reset_gpu(adev); amdgpu_ras_reset_gpu(adev);
} }
...@@ -196,7 +194,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, ...@@ -196,7 +194,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
} }
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
bool reset, uint32_t timeout_ms) uint32_t reset, uint32_t timeout_ms)
{ {
struct ras_err_data err_data; struct ras_err_data err_data;
struct ras_common_if head = { struct ras_common_if head = {
...@@ -238,8 +236,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, ...@@ -238,8 +236,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
if (reset) { if (reset) {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
/* use mode-2 reset for poison consumption */ con->gpu_reset_flags |= reset;
con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
amdgpu_ras_reset_gpu(adev); amdgpu_ras_reset_gpu(adev);
} }
...@@ -247,7 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, ...@@ -247,7 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
} }
int amdgpu_umc_poison_handler(struct amdgpu_device *adev, int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, bool reset) enum amdgpu_ras_block block, uint32_t reset)
{ {
int ret = AMDGPU_RAS_SUCCESS; int ret = AMDGPU_RAS_SUCCESS;
...@@ -311,7 +308,8 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, ...@@ -311,7 +308,8 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
void *ras_error_status, void *ras_error_status,
struct amdgpu_iv_entry *entry) struct amdgpu_iv_entry *entry)
{ {
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true); return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry,
AMDGPU_RAS_GPU_RESET_MODE1_RESET);
} }
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev) int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
......
...@@ -101,7 +101,7 @@ struct amdgpu_umc { ...@@ -101,7 +101,7 @@ struct amdgpu_umc {
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev); int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
int amdgpu_umc_poison_handler(struct amdgpu_device *adev, int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, bool reset); enum amdgpu_ras_block block, uint32_t reset);
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source, struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry); struct amdgpu_iv_entry *entry);
...@@ -121,5 +121,5 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev, ...@@ -121,5 +121,5 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
umc_func func, void *data); umc_func func, void *data);
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
bool reset, uint32_t timeout_ms); uint32_t reset, uint32_t timeout_ms);
#endif #endif
...@@ -134,6 +134,7 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev, ...@@ -134,6 +134,7 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
{ {
enum amdgpu_ras_block block = 0; enum amdgpu_ras_block block = 0;
int old_poison, ret = -EINVAL; int old_poison, ret = -EINVAL;
uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
if (!p) if (!p)
...@@ -153,6 +154,8 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev, ...@@ -153,6 +154,8 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
case SOC15_IH_CLIENTID_UTCL2: case SOC15_IH_CLIENTID_UTCL2:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid); ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX; block = AMDGPU_RAS_BLOCK__GFX;
if (ret)
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break; break;
case SOC15_IH_CLIENTID_SDMA0: case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1: case SOC15_IH_CLIENTID_SDMA1:
...@@ -160,6 +163,7 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev, ...@@ -160,6 +163,7 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
case SOC15_IH_CLIENTID_SDMA3: case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4: case SOC15_IH_CLIENTID_SDMA4:
block = AMDGPU_RAS_BLOCK__SDMA; block = AMDGPU_RAS_BLOCK__SDMA;
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break; break;
default: default:
break; break;
...@@ -170,17 +174,16 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev, ...@@ -170,17 +174,16 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
/* resetting queue passes, do page retirement without gpu reset /* resetting queue passes, do page retirement without gpu reset
* resetting queue fails, fallback to gpu reset solution * resetting queue fails, fallback to gpu reset solution
*/ */
if (!ret) { if (!ret)
dev_warn(dev->adev->dev, dev_warn(dev->adev->dev,
"RAS poison consumption, unmap queue flow succeeded: client id %d\n", "RAS poison consumption, unmap queue flow succeeded: client id %d\n",
client_id); client_id);
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false); else
} else {
dev_warn(dev->adev->dev, dev_warn(dev->adev->dev,
"RAS poison consumption, fall back to gpu reset flow: client id %d\n", "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
client_id); client_id);
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
} amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
} }
static bool event_interrupt_isr_v10(struct kfd_node *dev, static bool event_interrupt_isr_v10(struct kfd_node *dev,
......
...@@ -193,6 +193,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, ...@@ -193,6 +193,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
{ {
enum amdgpu_ras_block block = 0; enum amdgpu_ras_block block = 0;
int ret = -EINVAL; int ret = -EINVAL;
uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
if (!p) if (!p)
...@@ -212,10 +213,13 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, ...@@ -212,10 +213,13 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
if (dev->dqm->ops.reset_queues) if (dev->dqm->ops.reset_queues)
ret = dev->dqm->ops.reset_queues(dev->dqm, pasid); ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX; block = AMDGPU_RAS_BLOCK__GFX;
if (ret)
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break; break;
case SOC21_INTSRC_SDMA_ECC: case SOC21_INTSRC_SDMA_ECC:
default: default:
block = AMDGPU_RAS_BLOCK__GFX; block = AMDGPU_RAS_BLOCK__GFX;
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break; break;
} }
...@@ -223,10 +227,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, ...@@ -223,10 +227,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
/* resetting queue passes, do page retirement without gpu reset /* resetting queue passes, do page retirement without gpu reset
resetting queue fails, fallback to gpu reset solution */ resetting queue fails, fallback to gpu reset solution */
if (!ret) amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
else
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
} }
static bool event_interrupt_isr_v11(struct kfd_node *dev, static bool event_interrupt_isr_v11(struct kfd_node *dev,
......
...@@ -145,6 +145,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, ...@@ -145,6 +145,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
{ {
enum amdgpu_ras_block block = 0; enum amdgpu_ras_block block = 0;
int old_poison, ret = -EINVAL; int old_poison, ret = -EINVAL;
uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
if (!p) if (!p)
...@@ -164,6 +165,15 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, ...@@ -164,6 +165,15 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
case SOC15_IH_CLIENTID_UTCL2: case SOC15_IH_CLIENTID_UTCL2:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid); ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX; block = AMDGPU_RAS_BLOCK__GFX;
if (ret)
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
case SOC15_IH_CLIENTID_VMC:
case SOC15_IH_CLIENTID_VMC1:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__MMHUB;
if (ret)
reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
break; break;
case SOC15_IH_CLIENTID_SDMA0: case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1: case SOC15_IH_CLIENTID_SDMA1:
...@@ -171,6 +181,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, ...@@ -171,6 +181,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
case SOC15_IH_CLIENTID_SDMA3: case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4: case SOC15_IH_CLIENTID_SDMA4:
block = AMDGPU_RAS_BLOCK__SDMA; block = AMDGPU_RAS_BLOCK__SDMA;
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break; break;
default: default:
break; break;
...@@ -181,17 +192,16 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, ...@@ -181,17 +192,16 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
/* resetting queue passes, do page retirement without gpu reset /* resetting queue passes, do page retirement without gpu reset
* resetting queue fails, fallback to gpu reset solution * resetting queue fails, fallback to gpu reset solution
*/ */
if (!ret) { if (!ret)
dev_warn(dev->adev->dev, dev_warn(dev->adev->dev,
"RAS poison consumption, unmap queue flow succeeded: client id %d\n", "RAS poison consumption, unmap queue flow succeeded: client id %d\n",
client_id); client_id);
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false); else
} else {
dev_warn(dev->adev->dev, dev_warn(dev->adev->dev,
"RAS poison consumption, fall back to gpu reset flow: client id %d\n", "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
client_id); client_id);
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
} amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
} }
static bool context_id_expected(struct kfd_dev *dev) static bool context_id_expected(struct kfd_dev *dev)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment