Commit 2c22ed0b, authored by Tao Zhou, committed by Alex Deucher

drm/amdgpu: add instance mask for RAS inject

Users can select which instances receive an injected error by passing an
instance mask. For backward compatibility, the mask value is folded into
the sub-block index, so the RAS TA interface does not change.
The user supplies a logical mask, and the driver converts it to the
physical value before sending it to the RAS TA.

v2: update parameter name.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Stanley.Yang <Stanley.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent af2ba368
...@@ -1672,14 +1672,33 @@ int psp_ras_initialize(struct psp_context *psp) ...@@ -1672,14 +1672,33 @@ int psp_ras_initialize(struct psp_context *psp)
} }
int psp_ras_trigger_error(struct psp_context *psp, int psp_ras_trigger_error(struct psp_context *psp,
struct ta_ras_trigger_error_input *info) struct ta_ras_trigger_error_input *info, uint32_t instance_mask)
{ {
struct ta_ras_shared_memory *ras_cmd; struct ta_ras_shared_memory *ras_cmd;
struct amdgpu_device *adev = psp->adev;
int ret; int ret;
uint32_t dev_mask;
if (!psp->ras_context.context.initialized) if (!psp->ras_context.context.initialized)
return -EINVAL; return -EINVAL;
switch (info->block_id) {
case TA_RAS_BLOCK__GFX:
dev_mask = GET_MASK(GC, instance_mask);
break;
case TA_RAS_BLOCK__SDMA:
dev_mask = GET_MASK(SDMA0, instance_mask);
break;
default:
dev_mask = instance_mask;
break;
}
/* reuse sub_block_index for backward compatibility */
dev_mask <<= AMDGPU_RAS_INST_SHIFT;
dev_mask &= AMDGPU_RAS_INST_MASK;
info->sub_block_index |= dev_mask;
ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf; ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory)); memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
......
...@@ -486,7 +486,7 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id); ...@@ -486,7 +486,7 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id);
int psp_ras_enable_features(struct psp_context *psp, int psp_ras_enable_features(struct psp_context *psp,
union ta_ras_cmd_input *info, bool enable); union ta_ras_cmd_input *info, bool enable);
int psp_ras_trigger_error(struct psp_context *psp, int psp_ras_trigger_error(struct psp_context *psp,
struct ta_ras_trigger_error_input *info); struct ta_ras_trigger_error_input *info, uint32_t instance_mask);
int psp_ras_terminate(struct psp_context *psp); int psp_ras_terminate(struct psp_context *psp);
int psp_hdcp_invoke(struct psp_context *psp, uint32_t ta_cmd_id); int psp_hdcp_invoke(struct psp_context *psp, uint32_t ta_cmd_id);
......
...@@ -256,6 +256,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, ...@@ -256,6 +256,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
int block_id; int block_id;
uint32_t sub_block; uint32_t sub_block;
u64 address, value; u64 address, value;
/* default value is 0 if the mask is not set by user */
u32 instance_mask = 0;
if (*pos) if (*pos)
return -EINVAL; return -EINVAL;
...@@ -306,7 +308,11 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, ...@@ -306,7 +308,11 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
data->op = op; data->op = op;
if (op == 2) { if (op == 2) {
if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx", if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x",
&sub_block, &address, &value, &instance_mask) != 4 &&
sscanf(str, "%*s %*s %*s %u %llu %llu %u",
&sub_block, &address, &value, &instance_mask) != 4 &&
sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
&sub_block, &address, &value) != 3 && &sub_block, &address, &value) != 3 &&
sscanf(str, "%*s %*s %*s %u %llu %llu", sscanf(str, "%*s %*s %*s %u %llu %llu",
&sub_block, &address, &value) != 3) &sub_block, &address, &value) != 3)
...@@ -314,6 +320,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, ...@@ -314,6 +320,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
data->head.sub_block_index = sub_block; data->head.sub_block_index = sub_block;
data->inject.address = address; data->inject.address = address;
data->inject.value = value; data->inject.value = value;
data->inject.instance_mask = instance_mask;
} }
} else { } else {
if (size < sizeof(*data)) if (size < sizeof(*data))
...@@ -341,7 +348,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, ...@@ -341,7 +348,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
* sub_block_index: some IPs have subcomponents, e.g. GFX, sDMA. * sub_block_index: some IPs have subcomponents, e.g. GFX, sDMA.
* name: the name of IP. * name: the name of IP.
* *
* inject has two more members than head, they are address, value. * inject has three more members than head, they are address, value and mask.
* As their names indicate, inject operation will write the * As their names indicate, inject operation will write the
* value to the address. * value to the address.
* *
...@@ -365,7 +372,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, ...@@ -365,7 +372,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
* *
* echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl * echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
* echo "enable <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl * echo "enable <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
* echo "inject <block> <error> <sub-block> <address> <value> > /sys/kernel/debug/dri/<N>/ras/ras_ctrl * echo "inject <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
* *
* Where N, is the card which you want to affect. * Where N, is the card which you want to affect.
* *
...@@ -382,13 +389,14 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, ...@@ -382,13 +389,14 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
* *
* The sub-block is the sub-block index, pass 0 if there is no sub-block. * The sub-block is the sub-block index, pass 0 if there is no sub-block.
* The address and value are hexadecimal numbers, leading 0x is optional. * The address and value are hexadecimal numbers, leading 0x is optional.
* The mask is the instance mask. It is optional; the parser leaves it at 0 when it is omitted.
* *
* For instance, * For instance,
* *
* .. code-block:: bash * .. code-block:: bash
* *
* echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl * echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
* echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl * echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl
* echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
* *
* How to check the result of the operation? * How to check the result of the operation?
...@@ -1117,13 +1125,14 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, ...@@ -1117,13 +1125,14 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
if (info->head.block == AMDGPU_RAS_BLOCK__GFX) { if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
if (block_obj->hw_ops->ras_error_inject) if (block_obj->hw_ops->ras_error_inject)
ret = block_obj->hw_ops->ras_error_inject(adev, info); ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask);
} else { } else {
/* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */ /* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */
if (block_obj->hw_ops->ras_error_inject) if (block_obj->hw_ops->ras_error_inject)
ret = block_obj->hw_ops->ras_error_inject(adev, &block_info); ret = block_obj->hw_ops->ras_error_inject(adev, &block_info,
info->instance_mask);
else /*If not defined .ras_error_inject, use default ras_error_inject*/ else /*If not defined .ras_error_inject, use default ras_error_inject*/
ret = psp_ras_trigger_error(&adev->psp, &block_info); ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask);
} }
if (ret) if (ret)
......
...@@ -32,6 +32,11 @@ ...@@ -32,6 +32,11 @@
struct amdgpu_iv_entry; struct amdgpu_iv_entry;
#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0) #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0)
/* position of instance value in sub_block_index of
* ta_ras_trigger_error_input, the sub block uses lower 12 bits
*/
#define AMDGPU_RAS_INST_MASK 0xfffff000
#define AMDGPU_RAS_INST_SHIFT 0xc
enum amdgpu_ras_block { enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__UMC = 0, AMDGPU_RAS_BLOCK__UMC = 0,
...@@ -508,6 +513,7 @@ struct ras_inject_if { ...@@ -508,6 +513,7 @@ struct ras_inject_if {
struct ras_common_if head; struct ras_common_if head;
uint64_t address; uint64_t address;
uint64_t value; uint64_t value;
uint32_t instance_mask;
}; };
struct ras_cure_if { struct ras_cure_if {
...@@ -545,7 +551,8 @@ struct amdgpu_ras_block_object { ...@@ -545,7 +551,8 @@ struct amdgpu_ras_block_object {
}; };
struct amdgpu_ras_block_hw_ops { struct amdgpu_ras_block_hw_ops {
int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if); int (*ras_error_inject)(struct amdgpu_device *adev,
void *inject_if, uint32_t instance_mask);
void (*query_ras_error_count)(struct amdgpu_device *adev, void *ras_error_status); void (*query_ras_error_count)(struct amdgpu_device *adev, void *ras_error_status);
void (*query_ras_error_status)(struct amdgpu_device *adev); void (*query_ras_error_status)(struct amdgpu_device *adev);
void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status); void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status);
......
...@@ -1014,7 +1014,8 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, ...@@ -1014,7 +1014,8 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
} }
/* Trigger XGMI/WAFL error */ /* Trigger XGMI/WAFL error */
static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *inject_if) static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
void *inject_if, uint32_t instance_mask)
{ {
int ret = 0; int ret = 0;
struct ta_ras_trigger_error_input *block_info = struct ta_ras_trigger_error_input *block_info =
...@@ -1026,7 +1027,7 @@ static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *injec ...@@ -1026,7 +1027,7 @@ static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *injec
if (amdgpu_dpm_allow_xgmi_power_down(adev, false)) if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
dev_warn(adev->dev, "Failed to disallow XGMI power down"); dev_warn(adev->dev, "Failed to disallow XGMI power down");
ret = psp_ras_trigger_error(&adev->psp, block_info); ret = psp_ras_trigger_error(&adev->psp, block_info, instance_mask);
if (amdgpu_ras_intr_triggered()) if (amdgpu_ras_intr_triggered())
return ret; return ret;
......
...@@ -770,7 +770,7 @@ static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring); ...@@ -770,7 +770,7 @@ static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev, static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status); void *ras_error_status);
static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev, static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
void *inject_if); void *inject_if, uint32_t instance_mask);
static void gfx_v9_0_reset_ras_error_count(struct amdgpu_device *adev); static void gfx_v9_0_reset_ras_error_count(struct amdgpu_device *adev);
static void gfx_v9_0_kiq_set_resources(struct amdgpu_ring *kiq_ring, static void gfx_v9_0_kiq_set_resources(struct amdgpu_ring *kiq_ring,
...@@ -6335,7 +6335,7 @@ static const struct soc15_ras_field_entry gfx_v9_0_ras_fields[] = { ...@@ -6335,7 +6335,7 @@ static const struct soc15_ras_field_entry gfx_v9_0_ras_fields[] = {
}; };
static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev, static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
void *inject_if) void *inject_if, uint32_t instance_mask)
{ {
struct ras_inject_if *info = (struct ras_inject_if *)inject_if; struct ras_inject_if *info = (struct ras_inject_if *)inject_if;
int ret; int ret;
...@@ -6374,7 +6374,7 @@ static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev, ...@@ -6374,7 +6374,7 @@ static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
block_info.value = info->value; block_info.value = info->value;
mutex_lock(&adev->grbm_idx_mutex); mutex_lock(&adev->grbm_idx_mutex);
ret = psp_ras_trigger_error(&adev->psp, &block_info); ret = psp_ras_trigger_error(&adev->psp, &block_info, instance_mask);
mutex_unlock(&adev->grbm_idx_mutex); mutex_unlock(&adev->grbm_idx_mutex);
return ret; return ret;
......
...@@ -971,7 +971,7 @@ static void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev) ...@@ -971,7 +971,7 @@ static void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev)
} }
static int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev, static int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev,
void *inject_if) void *inject_if, uint32_t instance_mask)
{ {
struct ras_inject_if *info = (struct ras_inject_if *)inject_if; struct ras_inject_if *info = (struct ras_inject_if *)inject_if;
int ret; int ret;
...@@ -987,7 +987,7 @@ static int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev, ...@@ -987,7 +987,7 @@ static int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev,
block_info.value = info->value; block_info.value = info->value;
mutex_lock(&adev->grbm_idx_mutex); mutex_lock(&adev->grbm_idx_mutex);
ret = psp_ras_trigger_error(&adev->psp, &block_info); ret = psp_ras_trigger_error(&adev->psp, &block_info, instance_mask);
mutex_unlock(&adev->grbm_idx_mutex); mutex_unlock(&adev->grbm_idx_mutex);
return ret; return ret;
......
...@@ -1699,7 +1699,8 @@ static void gfx_v9_4_2_reset_ras_error_count(struct amdgpu_device *adev) ...@@ -1699,7 +1699,8 @@ static void gfx_v9_4_2_reset_ras_error_count(struct amdgpu_device *adev)
gfx_v9_4_2_query_utc_edc_count(adev, NULL, NULL); gfx_v9_4_2_query_utc_edc_count(adev, NULL, NULL);
} }
static int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_if) static int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev,
void *inject_if, uint32_t instance_mask)
{ {
struct ras_inject_if *info = (struct ras_inject_if *)inject_if; struct ras_inject_if *info = (struct ras_inject_if *)inject_if;
int ret; int ret;
...@@ -1715,7 +1716,7 @@ static int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_ ...@@ -1715,7 +1716,7 @@ static int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_
block_info.value = info->value; block_info.value = info->value;
mutex_lock(&adev->grbm_idx_mutex); mutex_lock(&adev->grbm_idx_mutex);
ret = psp_ras_trigger_error(&adev->psp, &block_info); ret = psp_ras_trigger_error(&adev->psp, &block_info, instance_mask);
mutex_unlock(&adev->grbm_idx_mutex); mutex_unlock(&adev->grbm_idx_mutex);
return ret; return ret;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment