Commit 5c0a1cdd authored by Yunxiang Li, committed by Alex Deucher

drm/amdgpu: fix sriov host flr handler

We send back the ready-to-reset message before we stop anything, which is
wrong. Move it to the point where we are actually ready for the FLR to
happen.

In the current state, since we take tens of seconds to stop everything, it
is very likely that the host gives up waiting and resets the GPU before we
send ready-to-reset, so the outcome is the same as before. But this gets
rid of the hack with reset_domain locking, and it also lets the host
measure how slow the ready-to-reset path actually is. That speed can be
improved later.
Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Emily Deng <Emily.Deng@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent b3948ad1
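
A simplified sketch of the guest-side host-FLR sequence after this change
(illustrative comment block only, not part of the commit; the function names
are the ones patched in the diff below):

/*
 * 1. Host signals FLR; the mailbox IRQ schedules xgpu_*_mailbox_flr_work(),
 *    which calls amdgpu_virt_fini_data_exchange() and kicks off GPU recovery.
 * 2. During recovery, once GPU activity has been stopped,
 *    amdgpu_device_reset_sriov() runs:
 *      amdgpu_virt_ready_to_reset(adev);  // IDH_READY_TO_RESET to the host
 *      amdgpu_virt_wait_reset(adev);      // poll for IDH_FLR_NOTIFICATION_CMPL
 *      r = amdgpu_virt_request_full_gpu(adev, true);
 */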
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5070,6 +5070,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 	struct amdgpu_hive_info *hive = NULL;
 
 	if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
+		amdgpu_virt_ready_to_reset(adev);
+		amdgpu_virt_wait_reset(adev);
 		clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
 		r = amdgpu_virt_request_full_gpu(adev, true);
 	} else {
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -152,6 +152,20 @@ void amdgpu_virt_request_init_data(struct amdgpu_device *adev)
 		DRM_WARN("host doesn't support REQ_INIT_DATA handshake\n");
 }
 
+/**
+ * amdgpu_virt_ready_to_reset() - send ready to reset to host
+ * @adev: amdgpu device.
+ * Send ready to reset message to GPU hypervisor to signal we have stopped GPU
+ * activity and are ready for host FLR
+ */
+void amdgpu_virt_ready_to_reset(struct amdgpu_device *adev)
+{
+	struct amdgpu_virt *virt = &adev->virt;
+
+	if (virt->ops && virt->ops->ready_to_reset)
+		virt->ops->ready_to_reset(adev);
+}
+
 /**
  * amdgpu_virt_wait_reset() - wait for reset gpu completed
  * @adev: amdgpu device.
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -88,6 +88,7 @@ struct amdgpu_virt_ops {
 	int (*rel_full_gpu)(struct amdgpu_device *adev, bool init);
 	int (*req_init_data)(struct amdgpu_device *adev);
 	int (*reset_gpu)(struct amdgpu_device *adev);
+	void (*ready_to_reset)(struct amdgpu_device *adev);
 	int (*wait_reset)(struct amdgpu_device *adev);
 	void (*trans_msg)(struct amdgpu_device *adev, enum idh_request req,
 			  u32 data1, u32 data2, u32 data3);
@@ -347,6 +348,7 @@ int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
 void amdgpu_virt_request_init_data(struct amdgpu_device *adev);
+void amdgpu_virt_ready_to_reset(struct amdgpu_device *adev);
 int amdgpu_virt_wait_reset(struct amdgpu_device *adev);
 int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev);
 void amdgpu_virt_free_mm_table(struct amdgpu_device *adev);
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -249,38 +249,30 @@ static int xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
 	return 0;
 }
 
-static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
+static void xgpu_ai_ready_to_reset(struct amdgpu_device *adev)
 {
-	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
-	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
-	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
-
-	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
-	 * otherwise the mailbox msg will be ruined/reseted by
-	 * the VF FLR.
-	 */
-	if (atomic_cmpxchg(&adev->reset_domain->in_gpu_reset, 0, 1) != 0)
-		return;
-
-	down_write(&adev->reset_domain->sem);
-
-	amdgpu_virt_fini_data_exchange(adev);
-
 	xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
+}
 
+static int xgpu_ai_wait_reset(struct amdgpu_device *adev)
+{
+	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
 	do {
 		if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
-			goto flr_done;
-
+			return 0;
 		msleep(10);
 		timeout -= 10;
 	} while (timeout > 1);
 
 	dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
+	return -ETIME;
+}
 
-flr_done:
-	atomic_set(&adev->reset_domain->in_gpu_reset, 0);
-	up_write(&adev->reset_domain->sem);
+static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
+{
+	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
+	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+
+	amdgpu_virt_fini_data_exchange(adev);
 
 	/* Trigger recovery for world switch failure if no TDR */
 	if (amdgpu_device_should_recover_gpu(adev)
@@ -417,7 +409,8 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
 	.req_full_gpu = xgpu_ai_request_full_gpu_access,
 	.rel_full_gpu = xgpu_ai_release_full_gpu_access,
 	.reset_gpu = xgpu_ai_request_reset,
-	.wait_reset = NULL,
+	.ready_to_reset = xgpu_ai_ready_to_reset,
+	.wait_reset = xgpu_ai_wait_reset,
 	.trans_msg = xgpu_ai_mailbox_trans_msg,
 	.req_init_data = xgpu_ai_request_init_data,
 	.ras_poison_handler = xgpu_ai_ras_poison_handler,
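
The new wait_reset callbacks return 0 once IDH_FLR_NOTIFICATION_CMPL is seen
and -ETIME on timeout. The caller in amdgpu_device_reset_sriov() above
ignores this return value; a stricter caller could look like the following
sketch (hypothetical, not part of this commit):

	if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
		amdgpu_virt_ready_to_reset(adev);
		/* Hypothetical: warn if the host never signals FLR completion
		 * instead of silently proceeding with the recovery. */
		if (amdgpu_virt_wait_reset(adev))
			dev_warn(adev->dev, "host FLR completion timed out\n");
		clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
		r = amdgpu_virt_request_full_gpu(adev, true);
	}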
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -282,38 +282,30 @@ static int xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
 	return 0;
 }
 
-static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
+static void xgpu_nv_ready_to_reset(struct amdgpu_device *adev)
 {
-	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
-	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
-	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
-
-	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
-	 * otherwise the mailbox msg will be ruined/reseted by
-	 * the VF FLR.
-	 */
-	if (atomic_cmpxchg(&adev->reset_domain->in_gpu_reset, 0, 1) != 0)
-		return;
-
-	down_write(&adev->reset_domain->sem);
-
-	amdgpu_virt_fini_data_exchange(adev);
-
 	xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
+}
 
+static int xgpu_nv_wait_reset(struct amdgpu_device *adev)
+{
+	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
 	do {
 		if (xgpu_nv_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
-			goto flr_done;
-
+			return 0;
 		msleep(10);
 		timeout -= 10;
 	} while (timeout > 1);
 
 	dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
+	return -ETIME;
+}
 
-flr_done:
-	atomic_set(&adev->reset_domain->in_gpu_reset, 0);
-	up_write(&adev->reset_domain->sem);
+static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
+{
+	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
+	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+
+	amdgpu_virt_fini_data_exchange(adev);
 
 	/* Trigger recovery for world switch failure if no TDR */
 	if (amdgpu_device_should_recover_gpu(adev)
@@ -455,7 +447,8 @@ const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
 	.rel_full_gpu = xgpu_nv_release_full_gpu_access,
 	.req_init_data = xgpu_nv_request_init_data,
 	.reset_gpu = xgpu_nv_request_reset,
-	.wait_reset = NULL,
+	.ready_to_reset = xgpu_nv_ready_to_reset,
+	.wait_reset = xgpu_nv_wait_reset,
 	.trans_msg = xgpu_nv_mailbox_trans_msg,
 	.ras_poison_handler = xgpu_nv_ras_poison_handler,
 };
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -515,12 +515,6 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
 	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 
-	/* wait until RCV_MSG become 3 */
-	if (xgpu_vi_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL)) {
-		pr_err("failed to receive FLR_CMPL\n");
-		return;
-	}
-
 	/* Trigger recovery due to world switch failure */
 	if (amdgpu_device_should_recover_gpu(adev)) {
 		struct amdgpu_reset_context reset_context;