Commit a4c63caf authored by Andrey Grodzovsky

drm/amdgpu: Introduce reset domain

Define a reset_domain struct so that
all the entities that go through reset
together are serialized against one
another. Do this for both the single-device
and the XGMI hive cases.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Suggested-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Suggested-by: Christian König <ckoenig.leichtzumerken@gmail.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://www.spinics.net/lists/amd-gfx/msg74111.html
parent b21a142f
...@@ -813,6 +813,10 @@ struct amd_powerplay { ...@@ -813,6 +813,10 @@ struct amd_powerplay {
#define AMDGPU_RESET_MAGIC_NUM 64 #define AMDGPU_RESET_MAGIC_NUM 64
#define AMDGPU_MAX_DF_PERFMONS 4 #define AMDGPU_MAX_DF_PERFMONS 4
#define AMDGPU_PRODUCT_NAME_LEN 64 #define AMDGPU_PRODUCT_NAME_LEN 64
/*
 * Reset domain: groups the entities that go through reset together so that
 * their resets are serialized against one another. An instance is embedded
 * in both struct amdgpu_device and struct amdgpu_hive_info.
 */
struct amdgpu_reset_domain {
/*
 * Workqueue belonging to this domain. It is created with
 * alloc_ordered_workqueue(), either by the XGMI hive or by a device that
 * is not part of a hive; a device in a hive copies the hive's pointer
 * instead of allocating its own.
 */
struct workqueue_struct *wq;
};
struct amdgpu_device { struct amdgpu_device {
struct device *dev; struct device *dev;
struct pci_dev *pdev; struct pci_dev *pdev;
...@@ -1100,6 +1104,7 @@ struct amdgpu_device { ...@@ -1100,6 +1104,7 @@ struct amdgpu_device {
uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE]; uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
bool ram_is_direct_mapped; bool ram_is_direct_mapped;
struct amdgpu_reset_domain reset_domain;
}; };
static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
......
...@@ -2398,9 +2398,27 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) ...@@ -2398,9 +2398,27 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
if (r) if (r)
goto init_failed; goto init_failed;
if (adev->gmc.xgmi.num_physical_nodes > 1) if (adev->gmc.xgmi.num_physical_nodes > 1) {
struct amdgpu_hive_info *hive;
amdgpu_xgmi_add_device(adev); amdgpu_xgmi_add_device(adev);
hive = amdgpu_get_xgmi_hive(adev);
/*
 * Bail out of init when the hive, or its reset workqueue, is unavailable.
 * hive may be NULL on this path, so the message must not dereference it:
 * report the hive id stored on the device, which is the value the hive's
 * own hive_id is populated from.
 */
if (!hive || !hive->reset_domain.wq) {
DRM_ERROR("Failed to obtain reset domain info for XGMI hive:%llx", adev->gmc.xgmi.hive_id);
r = -EINVAL;
goto init_failed;
}
adev->reset_domain.wq = hive->reset_domain.wq;
} else {
adev->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-dev", 0);
if (!adev->reset_domain.wq) {
r = -ENOMEM;
goto init_failed;
}
}
/* Don't init kfd if whole hive need to be reset during init */ /* Don't init kfd if whole hive need to be reset during init */
if (!adev->gmc.xgmi.pending_reset) if (!adev->gmc.xgmi.pending_reset)
amdgpu_amdkfd_device_init(adev); amdgpu_amdkfd_device_init(adev);
......
...@@ -398,6 +398,14 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) ...@@ -398,6 +398,14 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
goto pro_end; goto pro_end;
} }
hive->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-hive", 0);
if (!hive->reset_domain.wq) {
dev_err(adev->dev, "XGMI: failed allocating wq for reset domain!\n");
kfree(hive);
hive = NULL;
goto pro_end;
}
hive->hive_id = adev->gmc.xgmi.hive_id; hive->hive_id = adev->gmc.xgmi.hive_id;
INIT_LIST_HEAD(&hive->device_list); INIT_LIST_HEAD(&hive->device_list);
INIT_LIST_HEAD(&hive->node); INIT_LIST_HEAD(&hive->node);
...@@ -407,6 +415,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) ...@@ -407,6 +415,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
task_barrier_init(&hive->tb); task_barrier_init(&hive->tb);
hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
hive->hi_req_gpu = NULL; hive->hi_req_gpu = NULL;
/* /*
* hive pstate on boot is high in vega20 so we have to go to low * hive pstate on boot is high in vega20 so we have to go to low
* pstate on after boot. * pstate on after boot.
......
...@@ -42,6 +42,8 @@ struct amdgpu_hive_info { ...@@ -42,6 +42,8 @@ struct amdgpu_hive_info {
AMDGPU_XGMI_PSTATE_MAX_VEGA20, AMDGPU_XGMI_PSTATE_MAX_VEGA20,
AMDGPU_XGMI_PSTATE_UNKNOWN AMDGPU_XGMI_PSTATE_UNKNOWN
} pstate; } pstate;
struct amdgpu_reset_domain reset_domain;
}; };
struct amdgpu_pcs_ras_field { struct amdgpu_pcs_ras_field {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment