Commit 75ac6a25 authored by Yang Wang, committed by Alex Deucher

drm/amdgpu: refine amdgpu ras event id core code

v1:
- use a unified event id to manage ras events (a stand-alone sketch of
  the scheme follows the commit header below)
- add a new function amdgpu_ras_query_error_status_with_event() that
  accepts an event type as a parameter

v2:
add a warning log that reports the caller's location when
amdgpu_ras_mark_ras_event() fails. (Tao Zhou)

v3:
change RAS_EVENT_TYPE_ISR to RAS_EVENT_TYPE_FATAL.

v4:
rename amdgpu_ras_get_recovery_event() to
amdgpu_ras_get_fatal_error_event().
Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent e3369714
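
Before the diff, here is a compact model of what the patch builds: one
shared sequence counter (seqno) hands out ids for every event type, each
type caches the id of its most recently marked event (last_seqno[]), and
bit 63 flags an id as invalid. The code below is a minimal, self-contained
user-space sketch of that scheme, not the kernel code itself: C11 atomics
stand in for atomic64_t, the dev_warn() caller report is omitted, and the
helper names only mirror the patch.

/*
 * Stand-alone model of the unified ras event id scheme (illustration
 * only; the real driver code is in the diff below).
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define RAS_EVENT_INVALID_ID	 (1ULL << 63)	/* bit 63 = "no valid id" */
#define RAS_EVENT_ID_IS_VALID(x) (!((x) & (1ULL << 63)))

enum ras_event_type {
	RAS_EVENT_TYPE_INVALID = 0,
	RAS_EVENT_TYPE_FATAL,
	RAS_EVENT_TYPE_COUNT,
};

struct ras_event_manager {
	atomic_uint_fast64_t seqno;		   /* one counter for all types */
	uint64_t last_seqno[RAS_EVENT_TYPE_COUNT]; /* last id handed out per type */
};

static void event_mgr_init(struct ras_event_manager *mgr)
{
	atomic_init(&mgr->seqno, 0);
	for (int i = 0; i < RAS_EVENT_TYPE_COUNT; i++)
		mgr->last_seqno[i] = RAS_EVENT_INVALID_ID;
}

/* "mark": an event happened; take the next id and cache it per type
 * (mirrors the atomic64_inc_return() in the patch) */
static int mark_event(struct ras_event_manager *mgr, enum ras_event_type type)
{
	if (type >= RAS_EVENT_TYPE_COUNT)
		return -1;	/* the driver returns -EINVAL and warns */
	mgr->last_seqno[type] = atomic_fetch_add(&mgr->seqno, 1) + 1;
	return 0;
}

/* "acquire": later queries reuse the id of the last marked event */
static uint64_t acquire_event_id(struct ras_event_manager *mgr,
				 enum ras_event_type type)
{
	if (type != RAS_EVENT_TYPE_FATAL)
		return RAS_EVENT_INVALID_ID;
	return mgr->last_seqno[type];
}

int main(void)
{
	struct ras_event_manager mgr;

	event_mgr_init(&mgr);
	mark_event(&mgr, RAS_EVENT_TYPE_FATAL);		/* ISR side */

	/* query side: every log line for this event shares one id */
	uint64_t id = acquire_event_id(&mgr, RAS_EVENT_TYPE_FATAL);
	if (RAS_EVENT_ID_IS_VALID(id))
		printf("{%llu} fatal event logged\n", (unsigned long long)id);
	return 0;
}

The point of the mark/acquire split is visible in the ISR hunk further
down: amdgpu_ras_global_ras_isr() marks once per fatal interrupt, and the
recovery path afterwards only acquires, so every log line produced for
one fatal event carries the same {id} prefix.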
@@ -119,7 +119,7 @@ static struct aca_regs_dump {
 static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank,
 			      struct ras_query_context *qctx)
 {
-	u64 event_id = qctx ? qctx->event_id : 0ULL;
+	u64 event_id = qctx ? qctx->evid.event_id : RAS_EVENT_INVALID_ID;
 	int i;
 
 	RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
...
@@ -274,7 +274,7 @@ int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
 static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry,
 					 struct ras_query_context *qctx)
 {
-	u64 event_id = qctx->event_id;
+	u64 event_id = qctx ? qctx->evid.event_id : RAS_EVENT_INVALID_ID;
 
 	RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
 	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
@@ -543,7 +543,7 @@ static int mca_dump_show(struct seq_file *m, enum amdgpu_mca_error_type type)
 	amdgpu_mca_bank_set_init(&mca_set);
 
-	qctx.event_id = 0ULL;
+	qctx.evid.event_id = RAS_EVENT_INVALID_ID;
 	ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, &qctx);
 	if (ret)
 		goto err_free_mca_set;
...
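
Note the pattern shared by the two bank-dumper hunks above: both now
tolerate a NULL query context and fall back to the invalid-id sentinel
instead of a bare 0ULL. A compilable restatement of just that pattern,
with stand-in types (the helper name qctx_event_id() is hypothetical,
not part of the patch):

#include <stdint.h>
#include <stdio.h>

#define RAS_EVENT_INVALID_ID (1ULL << 63)

/* just enough of the driver types to compile the pattern */
struct ras_event_id { int type; uint64_t event_id; };
struct ras_query_context { struct ras_event_id evid; };

static uint64_t qctx_event_id(const struct ras_query_context *qctx)
{
	/* NULL context -> explicit sentinel, never a fake id of 0 */
	return qctx ? qctx->evid.event_id : RAS_EVENT_INVALID_ID;
}

int main(void)
{
	struct ras_query_context qctx = { .evid = { .event_id = 42 } };

	printf("%llu\n", (unsigned long long)qctx_event_id(&qctx)); /* 42 */
	printf("%llu\n", (unsigned long long)qctx_event_id(NULL));  /* sentinel */
	return 0;
}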
@@ -1055,7 +1055,7 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
 	struct amdgpu_smuio_mcm_config_info *mcm_info;
 	struct ras_err_node *err_node;
 	struct ras_err_info *err_info;
-	u64 event_id = qctx->event_id;
+	u64 event_id = qctx->evid.event_id;
 
 	if (is_ue) {
 		for_each_ras_error(err_node, err_data) {
@@ -1140,7 +1140,7 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
 {
 	struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
 	const char *blk_name = get_ras_block_str(&query_if->head);
-	u64 event_id = qctx->event_id;
+	u64 event_id = qctx->evid.event_id;
 
 	if (err_data->ce_count) {
 		if (err_data_has_source_info(err_data)) {
@@ -1366,7 +1366,9 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
 }
 
 /* query/inject/cure begin */
-int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
+static int amdgpu_ras_query_error_status_with_event(struct amdgpu_device *adev,
+						    struct ras_query_if *info,
+						    enum ras_event_type type)
 {
 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
 	struct ras_err_data err_data;
@@ -1385,8 +1387,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
 		return -EINVAL;
 
 	memset(&qctx, 0, sizeof(qctx));
-	qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ?
-						    RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID);
+	qctx.evid.type = type;
+	qctx.evid.event_id = amdgpu_ras_acquire_event_id(adev, type);
 
 	if (!down_read_trylock(&adev->reset_domain->sem)) {
 		ret = -EIO;
@@ -1415,6 +1417,11 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
 	return ret;
 }
 
+int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
+{
+	return amdgpu_ras_query_error_status_with_event(adev, info, RAS_EVENT_TYPE_INVALID);
+}
+
 int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
 				 enum amdgpu_ras_block block)
 {
@@ -2305,7 +2312,7 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
 /* ih end */
 
 /* traversal all IPs except NBIO to query error counter */
-static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
+static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev, enum ras_event_type type)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_manager *obj;
@@ -2338,7 +2345,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
 			IP_VERSION(13, 0, 2)))
 			continue;
 
-		amdgpu_ras_query_error_status(adev, &info);
+		amdgpu_ras_query_error_status_with_event(adev, &info, type);
 
 		if (amdgpu_ip_version(adev, MP0_HWIP, 0) !=
 		    IP_VERSION(11, 0, 2) &&
@@ -2477,6 +2484,14 @@ bool amdgpu_ras_in_recovery(struct amdgpu_device *adev)
 	return false;
 }
 
+static enum ras_event_type amdgpu_ras_get_fatal_error_event(struct amdgpu_device *adev)
+{
+	if (amdgpu_ras_intr_triggered())
+		return RAS_EVENT_TYPE_FATAL;
+	else
+		return RAS_EVENT_TYPE_INVALID;
+}
+
 static void amdgpu_ras_do_recovery(struct work_struct *work)
 {
 	struct amdgpu_ras *ras =
@@ -2485,6 +2500,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 	struct amdgpu_device *adev = ras->adev;
 	struct list_head device_list, *device_list_handle = NULL;
 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+	enum ras_event_type type;
 
 	if (hive) {
 		atomic_set(&hive->ras_recovery, 1);
@@ -2512,10 +2528,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 			device_list_handle = &device_list;
 		}
 
+		type = amdgpu_ras_get_fatal_error_event(adev);
 		list_for_each_entry(remote_adev,
 				device_list_handle, gmc.xgmi.head) {
 			amdgpu_ras_query_err_status(remote_adev);
-			amdgpu_ras_log_on_err_counter(remote_adev);
+			amdgpu_ras_log_on_err_counter(remote_adev, type);
 		}
 	}
 
@@ -3406,8 +3423,11 @@ static void ras_event_mgr_init(struct ras_event_manager *mgr)
 {
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(mgr->seqnos); i++)
-		atomic64_set(&mgr->seqnos[i], 0);
+	memset(mgr, 0, sizeof(*mgr));
+	atomic64_set(&mgr->seqno, 0);
+
+	for (i = 0; i < ARRAY_SIZE(mgr->last_seqno); i++)
+		mgr->last_seqno[i] = RAS_EVENT_INVALID_ID;
 }
 
 static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev)
@@ -3907,23 +3927,63 @@ void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
 	atomic_set(&ras->fed, !!status);
 }
 
-bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id)
+static struct ras_event_manager *__get_ras_event_mgr(struct amdgpu_device *adev)
 {
-	return !(id & BIT_ULL(63));
+	struct amdgpu_ras *ras;
+
+	ras = amdgpu_ras_get_context(adev);
+	if (!ras)
+		return NULL;
+
+	return ras->event_mgr;
+}
+
+int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type,
+				     const void *caller)
+{
+	struct ras_event_manager *event_mgr;
+	int ret = 0;
+
+	if (type >= RAS_EVENT_TYPE_COUNT) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	event_mgr = __get_ras_event_mgr(adev);
+	if (!event_mgr) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	event_mgr->last_seqno[type] = atomic64_inc_return(&event_mgr->seqno);
+
+out:
+	if (ret && caller)
+		dev_warn(adev->dev, "failed mark ras event (%d) in %ps, ret:%d\n",
+			 (int)type, caller, ret);
+
+	return ret;
 }
 
 u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type)
 {
-	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+	struct ras_event_manager *event_mgr;
 	u64 id;
 
+	if (type >= RAS_EVENT_TYPE_COUNT)
+		return RAS_EVENT_INVALID_ID;
+
 	switch (type) {
-	case RAS_EVENT_TYPE_ISR:
-		id = (u64)atomic64_read(&ras->event_mgr->seqnos[type]);
+	case RAS_EVENT_TYPE_FATAL:
+		event_mgr = __get_ras_event_mgr(adev);
+		if (!event_mgr)
+			return RAS_EVENT_INVALID_ID;
+
+		id = event_mgr->last_seqno[type];
 		break;
 	case RAS_EVENT_TYPE_INVALID:
 	default:
-		id = BIT_ULL(63) | 0ULL;
+		id = RAS_EVENT_INVALID_ID;
 		break;
 	}
 
@@ -3934,7 +3994,13 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
 {
 	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
 		struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-		u64 event_id = (u64)atomic64_inc_return(&ras->event_mgr->seqnos[RAS_EVENT_TYPE_ISR]);
+		enum ras_event_type type = RAS_EVENT_TYPE_FATAL;
+		u64 event_id;
+
+		if (amdgpu_ras_mark_ras_event(adev, type))
+			return;
+
+		event_id = amdgpu_ras_acquire_event_id(adev, type);
 
 		RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
 			"(ERREVENT_ATHUB_INTERRUPT) detected!\n");
@@ -4668,7 +4734,7 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
 	vaf.fmt = fmt;
 	vaf.va = &args;
 
-	if (amdgpu_ras_event_id_is_valid(adev, event_id))
+	if (RAS_EVENT_ID_IS_VALID(event_id))
 		dev_printk(KERN_INFO, adev->dev, "{%llu}%pV", event_id, &vaf);
 	else
 		dev_printk(KERN_INFO, adev->dev, "%pV", &vaf);
...
@@ -68,9 +68,15 @@ struct amdgpu_iv_entry;
 /* The high three bits indicates socketid */
 #define AMDGPU_RAS_GET_FEATURES(val)	((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
 
+#define RAS_EVENT_INVALID_ID		(BIT_ULL(63))
+#define RAS_EVENT_ID_IS_VALID(x)	(!((x) & BIT_ULL(63)))
+
 #define RAS_EVENT_LOG(adev, id, fmt, ...)	\
 	amdgpu_ras_event_log_print((adev), (id), (fmt), ##__VA_ARGS__)
 
+#define amdgpu_ras_mark_ras_event(adev, type)	\
+	(amdgpu_ras_mark_ras_event_caller((adev), (type), __builtin_return_address(0)))
+
 enum amdgpu_ras_block {
 	AMDGPU_RAS_BLOCK__UMC = 0,
 	AMDGPU_RAS_BLOCK__SDMA,
@@ -427,20 +433,25 @@ struct umc_ecc_info {
 };
 
 enum ras_event_type {
-	RAS_EVENT_TYPE_INVALID = -1,
-	RAS_EVENT_TYPE_ISR = 0,
+	RAS_EVENT_TYPE_INVALID = 0,
+	RAS_EVENT_TYPE_FATAL,
 	RAS_EVENT_TYPE_COUNT,
 };
 
 struct ras_event_manager {
-	atomic64_t seqnos[RAS_EVENT_TYPE_COUNT];
+	atomic64_t seqno;
+	u64 last_seqno[RAS_EVENT_TYPE_COUNT];
 };
 
-struct ras_query_context {
+struct ras_event_id {
 	enum ras_event_type type;
 	u64 event_id;
 };
 
+struct ras_query_context {
+	struct ras_event_id evid;
+};
+
 typedef int (*pasid_notify)(struct amdgpu_device *adev,
 		uint16_t pasid, void *data);
@@ -947,8 +958,9 @@ void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
 void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
 bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
-bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id);
 u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type);
+int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type,
+				     const void *caller);
 
 int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn);
...
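
To make the header's encoding concrete: RAS_EVENT_INVALID_ID sets bit 63,
RAS_EVENT_ID_IS_VALID() tests that bit, and amdgpu_ras_event_log_print()
prefixes a message with {id} only when the id is valid. A small user-space
illustration of that behaviour (printf stands in for dev_printk; the
messages are made up for the demo):

#include <stdint.h>
#include <stdio.h>

#define RAS_EVENT_INVALID_ID	 (1ULL << 63)
#define RAS_EVENT_ID_IS_VALID(x) (!((x) & (1ULL << 63)))

static void event_log_print(uint64_t event_id, const char *msg)
{
	if (RAS_EVENT_ID_IS_VALID(event_id))
		printf("{%llu}%s\n", (unsigned long long)event_id, msg);
	else
		printf("%s\n", msg);	/* no prefix for an invalid id */
}

int main(void)
{
	event_log_print(7, "uncorrectable hardware error detected");
	event_log_print(RAS_EVENT_INVALID_ID, "query without a marked event");
	return 0;
}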