Commit 12f230c0 authored by Muralidhara M K, committed by Borislav Petkov (AMD)

EDAC/amd64: Add support for family 0x19, models 0x90-9f devices

AMD Models 90h-9fh are APUs. They have built-in HBM3 memory. ECC support
is enabled by default.

APU models have a single Data Fabric (DF) per Package. Each DF is
visible to the OS in the same way as chiplet-based systems like Zen2
CPUs and later. However, the Unified Memory Controllers (UMCs) are
arranged in the same way as GPU-based MI200 devices rather than
CPU-based systems.

Use the existing gpu_ops for heterogeneous systems to support enumeration
of nodes and memory topology with a few fixups.

  [ bp: Massage comments. ]
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20231102114225.2006878-5-muralimk@amd.com
parent 9a5f580c
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -996,15 +996,23 @@ static struct local_node_map {
 #define LNTM_NODE_COUNT		GENMASK(27, 16)
 #define LNTM_BASE_NODE_ID	GENMASK(11, 0)
 
-static int gpu_get_node_map(void)
+static int gpu_get_node_map(struct amd64_pvt *pvt)
 {
 	struct pci_dev *pdev;
 	int ret;
 	u32 tmp;
 
 	/*
-	 * Node ID 0 is reserved for CPUs.
-	 * Therefore, a non-zero Node ID means we've already cached the values.
+	 * Mapping of nodes from hardware-provided AMD Node ID to a
+	 * Linux logical one is applicable for MI200 models. Therefore,
+	 * return early for other heterogeneous systems.
+	 */
+	if (pvt->F3->device != PCI_DEVICE_ID_AMD_MI200_DF_F3)
+		return 0;
+
+	/*
+	 * Node ID 0 is reserved for CPUs. Therefore, a non-zero Node ID
+	 * means the values have been already cached.
 	 */
 	if (gpu_node_map.base_node_id)
 		return 0;
@@ -3851,7 +3859,7 @@ static void gpu_init_csrows(struct mem_ctl_info *mci)
 
 		dimm->nr_pages = gpu_get_csrow_nr_pages(pvt, umc, cs);
 		dimm->edac_mode = EDAC_SECDED;
-		dimm->mtype = MEM_HBM2;
+		dimm->mtype = pvt->dram_type;
 		dimm->dtype = DEV_X16;
 		dimm->grain = 64;
 	}
@@ -3880,7 +3888,7 @@ static bool gpu_ecc_enabled(struct amd64_pvt *pvt)
 	return true;
 }
 
-static inline u32 gpu_get_umc_base(u8 umc, u8 channel)
+static inline u32 gpu_get_umc_base(struct amd64_pvt *pvt, u8 umc, u8 channel)
 {
 	/*
 	 * On CPUs, there is one channel per UMC, so UMC numbering equals
@@ -3893,13 +3901,16 @@ static inline u32 gpu_get_umc_base(u8 umc, u8 channel)
 	 * On GPU nodes channels are selected in 3rd nibble
 	 * HBM chX[3:0]= [Y  ]5X[3:0]000;
 	 * HBM chX[7:4]= [Y+1]5X[3:0]000
+	 *
+	 * On MI300 APU nodes, same as GPU nodes but channels are selected
+	 * in the base address of 0x90000
 	 */
 	umc *= 2;
 
 	if (channel >= 4)
 		umc++;
 
-	return 0x50000 + (umc << 20) + ((channel % 4) << 12);
+	return pvt->gpu_umc_base + (umc << 20) + ((channel % 4) << 12);
 }
 
 static void gpu_read_mc_regs(struct amd64_pvt *pvt)
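
As a sanity check on the addressing scheme above, here is a small standalone
sketch (not part of the patch) that mirrors the arithmetic in
gpu_get_umc_base(). The helper name umc_base() and the sample UMC/channel
values are made up for illustration; the 0x50000 (MI200) and 0x90000
(MI300 APU) bases come from the patch itself.

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	static uint32_t umc_base(uint32_t gpu_umc_base, uint8_t umc, uint8_t channel)
	{
		/* Each UMC covers two channel groups: channels 0-3 and 4-7. */
		umc *= 2;
		if (channel >= 4)
			umc++;

		/* UMC index lands in bits [23:20], channel in nibble 3, bits [15:12]. */
		return gpu_umc_base + ((uint32_t)umc << 20) +
		       ((uint32_t)(channel % 4) << 12);
	}

	int main(void)
	{
		/* MI200 (base 0x50000): UMC 0, channel 1 -> 0x51000. */
		assert(umc_base(0x50000, 0, 1) == 0x51000);
		/* MI200: UMC 0, channel 5 selects the second group -> 0x151000. */
		assert(umc_base(0x50000, 0, 5) == 0x151000);
		/* MI300 APU (base 0x90000): UMC 1, channel 5 -> 0x391000. */
		assert(umc_base(0x90000, 1, 5) == 0x391000);
		printf("all address checks passed\n");
		return 0;
	}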
@@ -3910,7 +3921,7 @@ static void gpu_read_mc_regs(struct amd64_pvt *pvt)
 
 	/* Read registers from each UMC */
 	for_each_umc(i) {
-		umc_base = gpu_get_umc_base(i, 0);
+		umc_base = gpu_get_umc_base(pvt, i, 0);
 		umc = &pvt->umc[i];
 
 		amd_smn_read(nid, umc_base + UMCCH_UMC_CFG, &umc->umc_cfg);
@@ -3927,7 +3938,7 @@ static void gpu_read_base_mask(struct amd64_pvt *pvt)
 
 	for_each_umc(umc) {
 		for_each_chip_select(cs, umc, pvt) {
-			base_reg = gpu_get_umc_base(umc, cs) + UMCCH_BASE_ADDR;
+			base_reg = gpu_get_umc_base(pvt, umc, cs) + UMCCH_BASE_ADDR;
 			base = &pvt->csels[umc].csbases[cs];
 
 			if (!amd_smn_read(pvt->mc_node_id, base_reg, base)) {
@@ -3935,7 +3946,7 @@ static void gpu_read_base_mask(struct amd64_pvt *pvt)
 				  umc, cs, *base, base_reg);
 		}
 
-		mask_reg = gpu_get_umc_base(umc, cs) + UMCCH_ADDR_MASK;
+		mask_reg = gpu_get_umc_base(pvt, umc, cs) + UMCCH_ADDR_MASK;
 		mask = &pvt->csels[umc].csmasks[cs];
 
 		if (!amd_smn_read(pvt->mc_node_id, mask_reg, mask)) {
@@ -3960,7 +3971,7 @@ static int gpu_hw_info_get(struct amd64_pvt *pvt)
 {
 	int ret;
 
-	ret = gpu_get_node_map();
+	ret = gpu_get_node_map(pvt);
 	if (ret)
 		return ret;
@@ -4125,6 +4136,8 @@ static int per_family_init(struct amd64_pvt *pvt)
 		if (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) {
 			pvt->ctl_name = "MI200";
 			pvt->max_mcs = 4;
+			pvt->dram_type = MEM_HBM2;
+			pvt->gpu_umc_base = 0x50000;
 			pvt->ops = &gpu_ops;
 		} else {
 			pvt->ctl_name = "F19h_M30h";
@@ -4142,6 +4155,13 @@ static int per_family_init(struct amd64_pvt *pvt)
 			pvt->ctl_name = "F19h_M70h";
 			pvt->flags.zn_regs_v2 = 1;
 			break;
+		case 0x90 ... 0x9f:
+			pvt->ctl_name = "F19h_M90h";
+			pvt->max_mcs = 4;
+			pvt->dram_type = MEM_HBM3;
+			pvt->gpu_umc_base = 0x90000;
+			pvt->ops = &gpu_ops;
+			break;
 		case 0xa0 ... 0xaf:
 			pvt->ctl_name = "F19h_MA0h";
 			pvt->max_mcs = 12;
@@ -4180,23 +4200,33 @@ static const struct attribute_group *amd64_edac_attr_groups[] = {
 	NULL
 };
 
+/*
+ * For heterogeneous and APU models EDAC CHIP_SELECT and CHANNEL layers
+ * should be swapped to fit into the layers.
+ */
+static unsigned int get_layer_size(struct amd64_pvt *pvt, u8 layer)
+{
+	bool is_gpu = (pvt->ops == &gpu_ops);
+
+	if (!layer)
+		return is_gpu ? pvt->max_mcs
+			      : pvt->csels[0].b_cnt;
+	else
+		return is_gpu ? pvt->csels[0].b_cnt
+			      : pvt->max_mcs;
+}
+
 static int init_one_instance(struct amd64_pvt *pvt)
 {
 	struct mem_ctl_info *mci = NULL;
 	struct edac_mc_layer layers[2];
 	int ret = -ENOMEM;
 
-	/*
-	 * For Heterogeneous family EDAC CHIP_SELECT and CHANNEL layers should
-	 * be swapped to fit into the layers.
-	 */
 	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
-	layers[0].size = (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) ?
-			 pvt->max_mcs : pvt->csels[0].b_cnt;
+	layers[0].size = get_layer_size(pvt, 0);
 	layers[0].is_virt_csrow = true;
 	layers[1].type = EDAC_MC_LAYER_CHANNEL;
-	layers[1].size = (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) ?
-			 pvt->csels[0].b_cnt : pvt->max_mcs;
+	layers[1].size = get_layer_size(pvt, 1);
 	layers[1].is_virt_csrow = false;
 
 	mci = edac_mc_alloc(pvt->mc_node_id, ARRAY_SIZE(layers), layers, 0);
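
For clarity, a minimal userspace sketch (not kernel code) of the
CHIP_SELECT/CHANNEL swap that the new get_layer_size() helper performs;
is_gpu and the counts below are hypothetical stand-ins for
pvt->ops == &gpu_ops, pvt->max_mcs and pvt->csels[0].b_cnt.

	#include <stdbool.h>
	#include <stdio.h>

	static unsigned int layer_size(bool is_gpu, unsigned int max_mcs,
				       unsigned int cs_count, unsigned int layer)
	{
		if (layer == 0)	/* EDAC_MC_LAYER_CHIP_SELECT */
			return is_gpu ? max_mcs : cs_count;

		/* EDAC_MC_LAYER_CHANNEL */
		return is_gpu ? cs_count : max_mcs;
	}

	int main(void)
	{
		/* GPU/APU node (e.g. 4 MCs, 8 chip selects): sizes come out swapped. */
		printf("gpu: layer0=%u layer1=%u\n",
		       layer_size(true, 4, 8, 0), layer_size(true, 4, 8, 1));

		/* CPU-style node: the conventional ordering. */
		printf("cpu: layer0=%u layer1=%u\n",
		       layer_size(false, 2, 4, 0), layer_size(false, 2, 4, 1));
		return 0;
	}

On GPU nodes the CHIP_SELECT layer thus reports the MC count and the
CHANNEL layer the chip-select count, which is exactly the swap the
comment above get_layer_size() describes.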
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -362,6 +362,7 @@ struct amd64_pvt {
 	u32 dct_sel_lo;		/* DRAM Controller Select Low */
 	u32 dct_sel_hi;		/* DRAM Controller Select High */
 	u32 online_spare;	/* On-Line spare Reg */
+	u32 gpu_umc_base;	/* Base address used for channel selection on GPUs */
 
 	/* x4, x8, or x16 syndromes in use */
 	u8 ecc_sym_sz;