Commit e23b74aa authored by Alex Deucher

drm/amdgpu: fix vf error handling

The error handling for virtual functions assumed a single
vf per VM and didn't properly account for bare metal.  Make
the error arrays per device and add locking.
Reviewed-by: Gavin Wan <gavin.wan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 6f87a895
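
The gist of the change, as a minimal user-space sketch (illustrative only, not the kernel code): the error ring buffer moves from a single global into each device object, and every writer serializes on a per-device lock. A pthread mutex and a bare "struct device" wrapper stand in here for the kernel's struct mutex and struct amdgpu_device.

/* User-space analogue of the per-device, mutex-protected error ring. */
#include <pthread.h>
#include <stdint.h>

#define VF_ERROR_ENTRY_SIZE 16

struct vf_error_buffer {
	pthread_mutex_t lock;
	int read_count;
	int write_count;
	uint16_t code[VF_ERROR_ENTRY_SIZE];
	uint16_t flags[VF_ERROR_ENTRY_SIZE];
	uint64_t data[VF_ERROR_ENTRY_SIZE];
};

struct device {			/* stands in for struct amdgpu_device */
	struct vf_error_buffer vf_errors;
};

/* Callers now pass the device, so two GPUs no longer share one error array. */
static void vf_error_put(struct device *dev, uint16_t code,
			 uint16_t flags, uint64_t data)
{
	struct vf_error_buffer *buf = &dev->vf_errors;
	int index;

	pthread_mutex_lock(&buf->lock);
	index = buf->write_count % VF_ERROR_ENTRY_SIZE;
	buf->code[index] = code;
	buf->flags[index] = flags;
	buf->data[index] = data;
	buf->write_count++;
	pthread_mutex_unlock(&buf->lock);
}

int main(void)
{
	struct device dev = { .vf_errors = { .lock = PTHREAD_MUTEX_INITIALIZER } };

	vf_error_put(&dev, 0x0001, 0, 0);	/* errors land in this device only */
	return 0;
}
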
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2040,6 +2040,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	mutex_init(&adev->srbm_mutex);
 	mutex_init(&adev->grbm_idx_mutex);
 	mutex_init(&adev->mn_lock);
+	mutex_init(&adev->virt.vf_errors.lock);
 	hash_init(adev->mn_hash);
 	amdgpu_check_arguments(adev);
@@ -2125,7 +2126,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	r = amdgpu_atombios_init(adev);
 	if (r) {
 		dev_err(adev->dev, "amdgpu_atombios_init failed\n");
-		amdgpu_vf_error_put(AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
+		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
 		goto failed;
 	}
@@ -2136,7 +2137,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	if (amdgpu_vpost_needed(adev)) {
 		if (!adev->bios) {
 			dev_err(adev->dev, "no vBIOS found\n");
-			amdgpu_vf_error_put(AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
+			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
 			r = -EINVAL;
 			goto failed;
 		}
@@ -2144,7 +2145,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 		r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
 		if (r) {
 			dev_err(adev->dev, "gpu post error!\n");
-			amdgpu_vf_error_put(AMDGIM_ERROR_VF_GPU_POST_ERROR, 0, 0);
+			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_POST_ERROR, 0, 0);
 			goto failed;
 		}
 	} else {
@@ -2156,7 +2157,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 		r = amdgpu_atomfirmware_get_clock_info(adev);
 		if (r) {
 			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
-			amdgpu_vf_error_put(AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
+			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
 			goto failed;
 		}
 	} else {
@@ -2164,7 +2165,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 		r = amdgpu_atombios_get_clock_info(adev);
 		if (r) {
 			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
-			amdgpu_vf_error_put(AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
+			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
 			goto failed;
 		}
 		/* init i2c buses */
@@ -2175,7 +2176,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	r = amdgpu_fence_driver_init(adev);
 	if (r) {
 		dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
-		amdgpu_vf_error_put(AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
+		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
 		goto failed;
 	}
@@ -2185,7 +2186,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	r = amdgpu_init(adev);
 	if (r) {
 		dev_err(adev->dev, "amdgpu_init failed\n");
-		amdgpu_vf_error_put(AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
+		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
 		amdgpu_fini(adev);
 		goto failed;
 	}
@@ -2205,7 +2206,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	r = amdgpu_ib_pool_init(adev);
 	if (r) {
 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
-		amdgpu_vf_error_put(AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
+		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
 		goto failed;
 	}
@@ -2254,7 +2255,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	r = amdgpu_late_init(adev);
 	if (r) {
 		dev_err(adev->dev, "amdgpu_late_init failed\n");
-		amdgpu_vf_error_put(AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
+		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
 		goto failed;
 	}
@@ -2936,7 +2937,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 			}
 		} else {
 			dev_err(adev->dev, "asic resume failed (%d).\n", r);
-			amdgpu_vf_error_put(AMDGIM_ERROR_VF_ASIC_RESUME_FAIL, 0, r);
+			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ASIC_RESUME_FAIL, 0, r);
 			for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 				if (adev->rings[i] && adev->rings[i]->sched.thread) {
 					kthread_unpark(adev->rings[i]->sched.thread);
@@ -2950,7 +2951,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 	if (r) {
 		/* bad news, how to tell it to userspace ? */
 		dev_info(adev->dev, "GPU reset failed\n");
-		amdgpu_vf_error_put(AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
+		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
 	}
 	else {
 		dev_info(adev->dev, "GPU reset successed!\n");
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c
@@ -25,30 +25,21 @@
 #include "amdgpu_vf_error.h"
 #include "mxgpu_ai.h"
 
-#define AMDGPU_VF_ERROR_ENTRY_SIZE 16
-
-/* struct error_entry - amdgpu VF error information. */
-struct amdgpu_vf_error_buffer {
-	int read_count;
-	int write_count;
-	uint16_t code[AMDGPU_VF_ERROR_ENTRY_SIZE];
-	uint16_t flags[AMDGPU_VF_ERROR_ENTRY_SIZE];
-	uint64_t data[AMDGPU_VF_ERROR_ENTRY_SIZE];
-};
-
-struct amdgpu_vf_error_buffer admgpu_vf_errors;
-
-void amdgpu_vf_error_put(uint16_t sub_error_code, uint16_t error_flags, uint64_t error_data)
+void amdgpu_vf_error_put(struct amdgpu_device *adev,
+			 uint16_t sub_error_code,
+			 uint16_t error_flags,
+			 uint64_t error_data)
 {
 	int index;
 	uint16_t error_code = AMDGIM_ERROR_CODE(AMDGIM_ERROR_CATEGORY_VF, sub_error_code);
 
-	index = admgpu_vf_errors.write_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
-	admgpu_vf_errors.code [index] = error_code;
-	admgpu_vf_errors.flags [index] = error_flags;
-	admgpu_vf_errors.data [index] = error_data;
-	admgpu_vf_errors.write_count ++;
+	mutex_lock(&adev->virt.vf_errors.lock);
+	index = adev->virt.vf_errors.write_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
+	adev->virt.vf_errors.code [index] = error_code;
+	adev->virt.vf_errors.flags [index] = error_flags;
+	adev->virt.vf_errors.data [index] = error_data;
+	adev->virt.vf_errors.write_count ++;
+	mutex_unlock(&adev->virt.vf_errors.lock);
 }
@@ -58,7 +49,8 @@ void amdgpu_vf_error_trans_all(struct amdgpu_device *adev)
 	u32 data1, data2, data3;
 	int index;
 
-	if ((NULL == adev) || (!amdgpu_sriov_vf(adev)) || (!adev->virt.ops) || (!adev->virt.ops->trans_msg)) {
+	if ((NULL == adev) || (!amdgpu_sriov_vf(adev)) ||
+	    (!adev->virt.ops) || (!adev->virt.ops->trans_msg)) {
 		return;
 	}
 	/*
@@ -68,18 +60,22 @@ void amdgpu_vf_error_trans_all(struct amdgpu_device *adev)
 		return;
 	}
 	*/
+	mutex_lock(&adev->virt.vf_errors.lock);
 	/* The errors are overlay of array, correct read_count as full. */
-	if (admgpu_vf_errors.write_count - admgpu_vf_errors.read_count > AMDGPU_VF_ERROR_ENTRY_SIZE) {
-		admgpu_vf_errors.read_count = admgpu_vf_errors.write_count - AMDGPU_VF_ERROR_ENTRY_SIZE;
+	if (adev->virt.vf_errors.write_count - adev->virt.vf_errors.read_count > AMDGPU_VF_ERROR_ENTRY_SIZE) {
+		adev->virt.vf_errors.read_count = adev->virt.vf_errors.write_count - AMDGPU_VF_ERROR_ENTRY_SIZE;
 	}
 
-	while (admgpu_vf_errors.read_count < admgpu_vf_errors.write_count) {
-		index =admgpu_vf_errors.read_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
-		data1 = AMDGIM_ERROR_CODE_FLAGS_TO_MAILBOX (admgpu_vf_errors.code[index], admgpu_vf_errors.flags[index]);
-		data2 = admgpu_vf_errors.data[index] & 0xFFFFFFFF;
-		data3 = (admgpu_vf_errors.data[index] >> 32) & 0xFFFFFFFF;
+	while (adev->virt.vf_errors.read_count < adev->virt.vf_errors.write_count) {
+		index =adev->virt.vf_errors.read_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
+		data1 = AMDGIM_ERROR_CODE_FLAGS_TO_MAILBOX(adev->virt.vf_errors.code[index],
+							   adev->virt.vf_errors.flags[index]);
+		data2 = adev->virt.vf_errors.data[index] & 0xFFFFFFFF;
+		data3 = (adev->virt.vf_errors.data[index] >> 32) & 0xFFFFFFFF;
 
 		adev->virt.ops->trans_msg(adev, IDH_LOG_VF_ERROR, data1, data2, data3);
-		admgpu_vf_errors.read_count ++;
+		adev->virt.vf_errors.read_count ++;
 	}
+	mutex_unlock(&adev->virt.vf_errors.lock);
 }
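
For the read_count clamp above, here is a standalone sketch (illustrative names, not the kernel API) of the same ring-buffer bookkeeping: once the writer gets more than AMDGPU_VF_ERROR_ENTRY_SIZE entries ahead of the reader, the oldest slots have already been overwritten, so the drain path fast-forwards read_count and reports only the newest entries.

#include <stdint.h>
#include <stdio.h>

#define ENTRY_SIZE 16

struct vf_error_buffer {
	int read_count;
	int write_count;
	uint16_t code[ENTRY_SIZE];
};

static void put_error(struct vf_error_buffer *buf, uint16_t code)
{
	buf->code[buf->write_count % ENTRY_SIZE] = code;	/* overwrite oldest slot */
	buf->write_count++;
}

static void drain(struct vf_error_buffer *buf)
{
	/* Writer overran the reader: only the newest ENTRY_SIZE entries survive. */
	if (buf->write_count - buf->read_count > ENTRY_SIZE)
		buf->read_count = buf->write_count - ENTRY_SIZE;

	while (buf->read_count < buf->write_count) {
		printf("error 0x%04x\n", (unsigned)buf->code[buf->read_count % ENTRY_SIZE]);
		buf->read_count++;
	}
}

int main(void)
{
	struct vf_error_buffer buf = { 0 };

	for (int i = 0; i < 20; i++)	/* 20 writes overrun a 16-entry ring */
		put_error(&buf, (uint16_t)i);
	drain(&buf);			/* reports only entries 4..19 */
	return 0;
}
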
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.h
@@ -56,7 +56,10 @@ enum AMDGIM_ERROR_CATEGORY {
 	AMDGIM_ERROR_CATEGORY_MAX
 };
 
-void amdgpu_vf_error_put(uint16_t sub_error_code, uint16_t error_flags, uint64_t error_data);
+void amdgpu_vf_error_put(struct amdgpu_device *adev,
+			 uint16_t sub_error_code,
+			 uint16_t error_flags,
+			 uint64_t error_data);
 void amdgpu_vf_error_trans_all (struct amdgpu_device *adev);
 
 #endif /* __VF_ERROR_H__ */
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -36,6 +36,18 @@ struct amdgpu_mm_table {
 	uint64_t gpu_addr;
 };
 
+#define AMDGPU_VF_ERROR_ENTRY_SIZE 16
+
+/* struct error_entry - amdgpu VF error information. */
+struct amdgpu_vf_error_buffer {
+	struct mutex lock;
+	int read_count;
+	int write_count;
+	uint16_t code[AMDGPU_VF_ERROR_ENTRY_SIZE];
+	uint16_t flags[AMDGPU_VF_ERROR_ENTRY_SIZE];
+	uint64_t data[AMDGPU_VF_ERROR_ENTRY_SIZE];
+};
+
 /**
  * struct amdgpu_virt_ops - amdgpu device virt operations
  */
@@ -59,6 +71,7 @@ struct amdgpu_virt {
 	struct work_struct flr_work;
 	struct amdgpu_mm_table mm_table;
 	const struct amdgpu_virt_ops *ops;
+	struct amdgpu_vf_error_buffer vf_errors;
 };
 
 #define AMDGPU_CSA_SIZE (8 * 1024)