Commit cbda2758 authored by Vignesh Chander, committed by Alex Deucher

drm/amdgpu: process RAS fatal error MB notification

For the RAS error scenario, the VF guest driver will check the mailbox
and set the fed flag to avoid unnecessary HW accesses.
Additionally, it polls for the reset completion message first
to avoid accidentally spamming multiple reset requests to the host.

v2: add another mailbox check to handle the case where kfd detects
the timeout first

v3: set the host_flr bit and use wait_for_reset
Signed-off-by: Vignesh Chander <Vignesh.Chander@amd.com>
Reviewed-by: Zhigang Luo <Zhigang.Luo@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 78146c1d
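
In outline, the flow this patch implements looks like the sketch below. This is a simplified distillation of the hunks that follow, not literal kernel code: vf_recover_from_ras() is a hypothetical wrapper, and locking, retries, and error handling are elided.

	/* Hypothetical sketch of the VF-side RAS fatal-error path. */
	static int vf_recover_from_ras(struct amdgpu_device *adev,
				       struct amdgpu_reset_context *reset_context)
	{
		/* The fatal error may be known from the fed flag already being
		 * set, or from a still-pending RAS message in the mailbox. */
		if (amdgpu_ras_get_fed_status(adev) ||
		    amdgpu_virt_rcvd_ras_interrupt(adev)) {
			amdgpu_ras_set_fed(adev, true);	/* stop further HW access */
			set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
		}

		/* With AMDGPU_HOST_FLR set, the reset path waits for the host's
		 * completion message instead of issuing its own reset request. */
		return amdgpu_device_reset_sriov(adev, reset_context);
	}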
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5069,7 +5069,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 	struct amdgpu_hive_info *hive = NULL;
 
 	if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
-		amdgpu_virt_ready_to_reset(adev);
+		if (!amdgpu_ras_get_fed_status(adev))
+			amdgpu_virt_ready_to_reset(adev);
 		amdgpu_virt_wait_reset(adev);
 		clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
 		r = amdgpu_virt_request_full_gpu(adev, true);
@@ -5837,6 +5838,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	/* Actual ASIC resets if needed.*/
 	/* Host driver will handle XGMI hive reset for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
+		if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
+			dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
+			amdgpu_ras_set_fed(adev, true);
+			set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
+		}
+
 		r = amdgpu_device_reset_sriov(adev, reset_context);
 		if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
 			amdgpu_virt_release_full_gpu(adev, true);
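
The subtle part of the amdgpu_device_reset_sriov() change is the ordering of the mailbox handshake. When the fed flag is set the host initiated the FLR itself, so the guest must not send the ready-to-reset notification, but it still has to poll for the completion message before asking for full GPU access again. The post-patch block, with comments added here for exposition, reads:

	if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
		/* For a RAS fatal error the host drives the FLR, so skip the
		 * guest-side ready-to-reset notification... */
		if (!amdgpu_ras_get_fed_status(adev))
			amdgpu_virt_ready_to_reset(adev);
		/* ...but always wait for the reset completion message first, to
		 * avoid spamming the host with duplicate reset requests. */
		amdgpu_virt_wait_reset(adev);
		clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
		r = amdgpu_virt_request_full_gpu(adev, true);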
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -229,6 +229,22 @@ void amdgpu_virt_free_mm_table(struct amdgpu_device *adev)
 	adev->virt.mm_table.gpu_addr = 0;
 }
 
+/**
+ * amdgpu_virt_rcvd_ras_interrupt() - receive ras interrupt
+ * @adev: amdgpu device.
+ * Check whether host sent RAS error message
+ * Return: true if found, otherwise false
+ */
+bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev)
+{
+	struct amdgpu_virt *virt = &adev->virt;
+
+	if (!virt->ops || !virt->ops->rcvd_ras_intr)
+		return false;
+
+	return virt->ops->rcvd_ras_intr(adev);
+}
+
 unsigned int amd_sriov_msg_checksum(void *obj,
 				    unsigned long obj_size,
@@ -612,11 +628,14 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
 	ret = amdgpu_virt_read_pf2vf_data(adev);
 	if (ret) {
 		adev->virt.vf2pf_update_retry_cnt++;
-		if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
-		    amdgpu_sriov_runtime(adev)) {
+
+		if ((amdgpu_virt_rcvd_ras_interrupt(adev) ||
+		     adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
+		    amdgpu_sriov_runtime(adev)) {
+			amdgpu_ras_set_fed(adev, true);
 			if (amdgpu_reset_domain_schedule(adev->reset_domain,
-							 &adev->kfd.reset_work))
+							  &adev->kfd.reset_work))
 				return;
 			else
 				dev_err(adev->dev, "Failed to queue work! at %s", __func__);
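
The vf2pf exchange doubles as a heartbeat: if reading the pf2vf data keeps failing, the VF concludes the host is unreachable. With this change the escalation fires immediately when a RAS message is pending in the mailbox, rather than only after the retry budget (lowered below from 5 to 2) is spent, and it latches the fed flag before the reset work runs. Consolidated, with comments added here for exposition:

	if (ret) {
		adev->virt.vf2pf_update_retry_cnt++;

		/* Escalate right away on a pending RAS notification; otherwise
		 * only once the retry limit is exhausted. */
		if ((amdgpu_virt_rcvd_ras_interrupt(adev) ||
		     adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
		    amdgpu_sriov_runtime(adev)) {
			/* Mark the fatal-error state before scheduling the reset
			 * work, so concurrent paths see it and back off. */
			amdgpu_ras_set_fed(adev, true);
			if (amdgpu_reset_domain_schedule(adev->reset_domain,
							  &adev->kfd.reset_work))
				return;
			else
				dev_err(adev->dev, "Failed to queue work! at %s", __func__);
		}
	}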
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -52,7 +52,7 @@
 /* tonga/fiji use this offset */
 #define mmBIF_IOV_FUNC_IDENTIFIER 0x1503
 
-#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 5
+#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 2
 
 enum amdgpu_sriov_vf_mode {
 	SRIOV_VF_MODE_BARE_METAL = 0,
@@ -94,6 +94,7 @@ struct amdgpu_virt_ops {
 			  u32 data1, u32 data2, u32 data3);
 	void (*ras_poison_handler)(struct amdgpu_device *adev,
 				   enum amdgpu_ras_block block);
+	bool (*rcvd_ras_intr)(struct amdgpu_device *adev);
 };
 
 /*
@@ -352,6 +353,7 @@ void amdgpu_virt_ready_to_reset(struct amdgpu_device *adev);
 int amdgpu_virt_wait_reset(struct amdgpu_device *adev);
 int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev);
 void amdgpu_virt_free_mm_table(struct amdgpu_device *adev);
+bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev);
 void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev);
 void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev);
 void amdgpu_virt_exchange_data(struct amdgpu_device *adev);
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -408,6 +408,13 @@ static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev,
 	xgpu_ai_send_access_requests(adev, IDH_RAS_POISON);
 }
 
+static bool xgpu_ai_rcvd_ras_intr(struct amdgpu_device *adev)
+{
+	enum idh_event msg = xgpu_ai_mailbox_peek_msg(adev);
+
+	return (msg == IDH_RAS_ERROR_DETECTED || msg == 0xFFFFFFFF);
+}
+
 const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
 	.req_full_gpu = xgpu_ai_request_full_gpu_access,
 	.rel_full_gpu = xgpu_ai_release_full_gpu_access,
@@ -417,4 +424,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
 	.trans_msg = xgpu_ai_mailbox_trans_msg,
 	.req_init_data = xgpu_ai_request_init_data,
 	.ras_poison_handler = xgpu_ai_ras_poison_handler,
+	.rcvd_ras_intr = xgpu_ai_rcvd_ras_intr,
 };
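
xgpu_ai_mailbox_peek_msg() (already present in this file) returns the pending mailbox message without acknowledging it, which is what makes this check safe to call from both the recovery path and the heartbeat. Treating 0xFFFFFFFF as a hit is presumably deliberate: a PCIe read from a function that is mid-FLR or otherwise inaccessible returns all ones, so an unreadable mailbox is itself evidence that the host is resetting the device. The nv implementation below is identical in shape.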
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
@@ -51,7 +51,9 @@ enum idh_event {
 	IDH_FAIL,
 	IDH_QUERY_ALIVE,
 	IDH_REQ_GPU_INIT_DATA_READY,
 	IDH_RAS_POISON_READY,
+	IDH_PF_SOFT_FLR_NOTIFICATION,
+	IDH_RAS_ERROR_DETECTED,
 	IDH_TEXT_MESSAGE = 255,
 };
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -449,6 +449,13 @@ static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev,
 	}
 }
 
+static bool xgpu_nv_rcvd_ras_intr(struct amdgpu_device *adev)
+{
+	enum idh_event msg = xgpu_nv_mailbox_peek_msg(adev);
+
+	return (msg == IDH_RAS_ERROR_DETECTED || msg == 0xFFFFFFFF);
+}
+
 const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
 	.req_full_gpu = xgpu_nv_request_full_gpu_access,
 	.rel_full_gpu = xgpu_nv_release_full_gpu_access,
@@ -458,4 +465,5 @@ const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
 	.wait_reset = xgpu_nv_wait_reset,
 	.trans_msg = xgpu_nv_mailbox_trans_msg,
 	.ras_poison_handler = xgpu_nv_ras_poison_handler,
+	.rcvd_ras_intr = xgpu_nv_rcvd_ras_intr,
 };
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
@@ -26,7 +26,7 @@
 #define NV_MAILBOX_POLL_ACK_TIMEDOUT 500
 #define NV_MAILBOX_POLL_MSG_TIMEDOUT 6000
-#define NV_MAILBOX_POLL_FLR_TIMEDOUT 5000
+#define NV_MAILBOX_POLL_FLR_TIMEDOUT 10000
 #define NV_MAILBOX_POLL_MSG_REP_MAX 11
 
 enum idh_request {
@@ -52,7 +52,8 @@ enum idh_event {
 	IDH_QUERY_ALIVE,
 	IDH_REQ_GPU_INIT_DATA_READY,
 	IDH_RAS_POISON_READY,
 	IDH_PF_SOFT_FLR_NOTIFICATION,
+	IDH_RAS_ERROR_DETECTED,
 	IDH_TEXT_MESSAGE = 255,
 };