Commit ce316fa5 authored by Le Ma's avatar Le Ma Committed by Alex Deucher

drm/amdgpu: add concurrent baco reset support for XGMI

Currently each XGMI node reset wq does not run in parrallel if bound to same
cpu. Make change to bound the xgmi_reset_work item to different cpus.

XGMI requires all nodes enter into baco within very close proximity before
any node exit baco. So schedule the xgmi_reset_work wq twice for enter/exit
baco respectively.

To use baco for XGMI, PMFW supported for baco on XGMI needs to be involved.

The case that PSP reset and baco reset coexist within an XGMI hive never exist
and is not in the consideration.

v2: define use_baco flag to simplify the code for xgmi baco sequence
Signed-off-by: default avatarLe Ma <le.ma@amd.com>
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 7a22677b
...@@ -992,6 +992,8 @@ struct amdgpu_device { ...@@ -992,6 +992,8 @@ struct amdgpu_device {
bool pm_sysfs_en; bool pm_sysfs_en;
bool ucode_sysfs_en; bool ucode_sysfs_en;
bool in_baco;
}; };
static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
......
...@@ -2661,7 +2661,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) ...@@ -2661,7 +2661,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
struct amdgpu_device *adev = struct amdgpu_device *adev =
container_of(__work, struct amdgpu_device, xgmi_reset_work); container_of(__work, struct amdgpu_device, xgmi_reset_work);
adev->asic_reset_res = amdgpu_asic_reset(adev); if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
adev->asic_reset_res = (adev->in_baco == false) ?
amdgpu_device_baco_enter(adev->ddev) :
amdgpu_device_baco_exit(adev->ddev);
else
adev->asic_reset_res = amdgpu_asic_reset(adev);
if (adev->asic_reset_res) if (adev->asic_reset_res)
DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
adev->asic_reset_res, adev->ddev->unique); adev->asic_reset_res, adev->ddev->unique);
...@@ -3787,13 +3793,18 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, ...@@ -3787,13 +3793,18 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
return r; return r;
} }
static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
struct amdgpu_hive_info *hive,
struct list_head *device_list_handle, struct list_head *device_list_handle,
bool *need_full_reset_arg) bool *need_full_reset_arg)
{ {
struct amdgpu_device *tmp_adev = NULL; struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset = *need_full_reset_arg, vram_lost = false; bool need_full_reset = *need_full_reset_arg, vram_lost = false;
int r = 0; int r = 0;
int cpu = smp_processor_id();
bool use_baco =
(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
true : false;
/* /*
* ASIC reset has to be done on all HGMI hive nodes ASAP * ASIC reset has to be done on all HGMI hive nodes ASAP
...@@ -3801,21 +3812,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, ...@@ -3801,21 +3812,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
*/ */
if (need_full_reset) { if (need_full_reset) {
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
/* For XGMI run all resets in parallel to speed up the process */ /*
* For XGMI run all resets in parallel to speed up the
* process by scheduling the highpri wq on different
* cpus. For XGMI with baco reset, all nodes must enter
* baco within close proximity before anyone exit.
*/
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) if (!queue_work_on(cpu, system_highpri_wq,
&tmp_adev->xgmi_reset_work))
r = -EALREADY; r = -EALREADY;
cpu = cpumask_next(cpu, cpu_online_mask);
} else } else
r = amdgpu_asic_reset(tmp_adev); r = amdgpu_asic_reset(tmp_adev);
if (r)
if (r) {
DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
r, tmp_adev->ddev->unique);
break; break;
}
} }
/* For XGMI wait for all PSP resets to complete before proceed */ /* For XGMI wait for all work to complete before proceed */
if (!r) { if (!r) {
list_for_each_entry(tmp_adev, device_list_handle, list_for_each_entry(tmp_adev, device_list_handle,
gmc.xgmi.head) { gmc.xgmi.head) {
...@@ -3824,11 +3838,54 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, ...@@ -3824,11 +3838,54 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
r = tmp_adev->asic_reset_res; r = tmp_adev->asic_reset_res;
if (r) if (r)
break; break;
if (use_baco)
tmp_adev->in_baco = true;
} }
} }
} }
}
/*
* For XGMI with baco reset, need exit baco phase by scheduling
* xgmi_reset_work one more time. PSP reset and sGPU skips this
* phase. Not assume the situation that PSP reset and baco reset
* coexist within an XGMI hive.
*/
if (!r && use_baco) {
cpu = smp_processor_id();
list_for_each_entry(tmp_adev, device_list_handle,
gmc.xgmi.head) {
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
if (!queue_work_on(cpu,
system_highpri_wq,
&tmp_adev->xgmi_reset_work))
r = -EALREADY;
if (r)
break;
cpu = cpumask_next(cpu, cpu_online_mask);
}
}
}
if (!r && use_baco) {
list_for_each_entry(tmp_adev, device_list_handle,
gmc.xgmi.head) {
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
flush_work(&tmp_adev->xgmi_reset_work);
r = tmp_adev->asic_reset_res;
if (r)
break;
tmp_adev->in_baco = false;
}
}
}
if (r) {
DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
r, tmp_adev->ddev->unique);
goto end;
}
}
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
if (need_full_reset) { if (need_full_reset) {
...@@ -4113,7 +4170,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4113,7 +4170,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
if (r) if (r)
adev->asic_reset_res = r; adev->asic_reset_res = r;
} else { } else {
r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); r = amdgpu_do_asic_reset(adev, hive, device_list_handle,
&need_full_reset);
if (r && r == -EAGAIN) if (r && r == -EAGAIN)
goto retry; goto retry;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment