drm/amdgpu: allow query error counters for specific IP block

amdgpu_ras_block_late_init will be invoked in IP specific ras_late_init call as a common helper for all the IP blocks. However, when amdgpu_ras_block_late_init call amdgpu_ras_query_error_count to query ras error counters, amdgpu_ras_query_error_count queries all the IP blocks that support ras query interface. This results to wrong error counters cached in software copies when there are ras errors detected at time zero or warm reset procedure. i.e., in sdma_ras_late_init phase, it counts on sdma/mmhub errors, while, in mmhub_ras_late_init phase, it still counts on sdma/mmhub errors. The change updates amdgpu_ras_query_error_count interface to allow query specific ip error counter. It introduces a new input parameter: query_info. if query_info is NULL, it means query all the IP blocks, otherwise, only query the ip block specified by query_info. Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

drm/amdgpu: allow query error counters for specific IP block
amdgpu_ras_block_late_init will be invoked in IP specific ras_late_init call as a common helper for all the IP blocks. However, when amdgpu_ras_block_late_init call amdgpu_ras_query_error_count to query ras error counters, amdgpu_ras_query_error_count queries all the IP blocks that support ras query interface. This results to wrong error counters cached in software copies when there are ras errors detected at time zero or warm reset procedure. i.e., in sdma_ras_late_init phase, it counts on sdma/mmhub errors, while, in mmhub_ras_late_init phase, it still counts on sdma/mmhub errors. The change updates amdgpu_ras_query_error_count interface to allow query specific ip error counter. It introduces a new input parameter: query_info. if query_info is NULL, it means query all the IP blocks, otherwise, only query the ip block specified by query_info. Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
4a1c9a44 · Hawking Zhang · Alex Deucher · 90f56611 · 4a1c9a44 · 4a1c9a44
Commit 4a1c9a44 authored Jan 03, 2023 by Hawking Zhang Committed by Alex Deucher Jan 05, 2023
Hide whitespace changes
Inline Side-by-side

Showing with 71 additions and 21 deletions

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +69 -20

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +2 -1

No files found.
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1136,11 +1136,54 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 }

 /**
- * amdgpu_ras_query_error_count -- Get error counts of all IPs
+ * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP
+ * @adev: pointer to AMD GPU device
+ * @ce_count: pointer to an integer to be set to the count of correctible errors.
+ * @ue_count: pointer to an integer to be set to the count of uncorrectible errors.
+ * @query_info: pointer to ras_query_if
+ *
+ * Return 0 for query success or do nothing, otherwise return an error
+ * on failures
+ */
+static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev,
+					       unsigned long *ce_count,
+					       unsigned long *ue_count,
+					       struct ras_query_if *query_info)
+{
+	int ret;
+
+	if (!query_info)
+		/* do nothing if query_info is not specified */
+		return 0;
+
+	ret = amdgpu_ras_query_error_status(adev, query_info);
+	if (ret)
+		return ret;
+
+	*ce_count += query_info->ce_count;
+	*ue_count += query_info->ue_count;
+
+	/* some hardware/IP supports read to clear
+	 * no need to explictly reset the err status after the query call */
+	if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+	    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+		if (amdgpu_ras_reset_error_status(adev, query_info->head.block))
+			dev_warn(adev->dev,
+				 "Failed to reset error counter and error status\n");
+	}
+
+	return 0;
+}
+
+/**
+ * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP
 * @adev: pointer to AMD GPU device
 * @ce_count: pointer to an integer to be set to the count of correctible errors.
 * @ue_count: pointer to an integer to be set to the count of uncorrectible
 * errors.
+ * @query_info: pointer to ras_query_if if the query request is only for
+ * specific ip block; if info is NULL, then the qurey request is for
+ * all the ip blocks that support query ras error counters/status
 *
 * If set, @ce_count or @ue_count, count and return the corresponding
 * error counts in those integer pointers. Return 0 if the device
@@ -1148,11 +1191,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 */
 int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 				 unsigned long *ce_count,
-				 unsigned long *ue_count)
+				 unsigned long *ue_count,
+				 struct ras_query_if *query_info)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_manager *obj;
 	unsigned long ce, ue;
+	int ret;

 	if (!adev->ras_enabled || !con)
 		return -EOPNOTSUPP;
@@ -1164,26 +1209,23 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,

 	ce = 0;
 	ue = 0;
-	list_for_each_entry(obj, &con->head, node) {
-		struct ras_query_if info = {
-			.head = obj->head,
-		};
-		int res;
-
-		res = amdgpu_ras_query_error_status(adev, &info);
-		if (res)
-			return res;
+	if (!query_info) {
+		/* query all the ip blocks that support ras query interface */
+		list_for_each_entry(obj, &con->head, node) {
+			struct ras_query_if info = {
+				.head = obj->head,
+			};

-		if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
-		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
-			if (amdgpu_ras_reset_error_status(adev, info.head.block))
-				dev_warn(adev->dev, "Failed to reset error counter and error status");
+			ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, &info);
 		}
-
-		ce += info.ce_count;
-		ue += info.ue_count;
+	} else {
+		/* query specific ip block */
+		ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, query_info);
 	}

+	if (ret)
+		return ret;
+
 	if (ce_count)
 		*ce_count = ce;

@@ -2411,7 +2453,7 @@ static void amdgpu_ras_counte_dw(struct work_struct *work)

 	/* Cache new values.
 	 */
-	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
+	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL) == 0) {
 		atomic_set(&con->ras_ce_count, ce_count);
 		atomic_set(&con->ras_ue_count, ue_count);
 	}
@@ -2592,6 +2634,7 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
 {
 	struct amdgpu_ras_block_object *ras_obj = NULL;
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_query_if *query_info;
 	unsigned long ue_count, ce_count;
 	int r;

@@ -2633,11 +2676,17 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,

 	/* Those are the cached values at init.
 	 */
-	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
+	query_info = kzalloc(sizeof(struct ras_query_if), GFP_KERNEL);
+	if (!query_info)
+		return -ENOMEM;
+	memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));
+
+	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) {
 		atomic_set(&con->ras_ce_count, ce_count);
 		atomic_set(&con->ras_ue_count, ue_count);
 	}

+	kfree(query_info);
 	return 0;

 interrupt:

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -540,7 +540,8 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev);

 int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 				 unsigned long *ce_count,
-				 unsigned long *ue_count);
+				 unsigned long *ue_count,
+				 struct ras_query_if *query_info);

 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,