Commit d9de5ce8 authored by Linus Torvalds

Merge tag 'edac_updates_for_v6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras

Pull EDAC updates from Borislav Petkov:

 - Add a driver for the RAS functionality on Xilinx's on-chip memory
   controller

 - Add support for decoding errors from the first and second level
   memory on SKL-based hardware

 - Add support for the memory controllers in Intel Granite Rapids and
   Emerald Rapids machines

 - First round of amd64_edac driver simplification and removal of
   unneeded functionality

 - The usual cleanups and fixes

* tag 'edac_updates_for_v6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras:
  EDAC/amd64: Shut up an -Werror,-Wsometimes-uninitialized clang false positive
  EDAC/amd64: Remove early_channel_count()
  EDAC/amd64: Remove PCI Function 0
  EDAC/amd64: Remove PCI Function 6
  EDAC/amd64: Remove scrub rate control for Family 17h and later
  EDAC/amd64: Don't set up EDAC PCI control on Family 17h+
  EDAC/i10nm: Add driver decoder for Sapphire Rapids server
  EDAC/i10nm: Add Intel Granite Rapids server support
  EDAC/i10nm: Make more configurations CPU model specific
  EDAC/i10nm: Add Intel Emerald Rapids server support
  EDAC/skx_common: Delete duplicated and unreachable code
  EDAC/skx_common: Enable EDAC support for the "near" memory
  EDAC/qcom: Add platform_device_id table for module autoloading
  EDAC/zynqmp: Add EDAC support for Xilinx ZynqMP OCM
  dt-bindings: edac: Add bindings for Xilinx ZynqMP OCM
parents 0246725d 28980db9
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Xilinx ZynqMP OCM (On-Chip Memory) Controller
maintainers:
  - Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
  - Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
description: |
  The OCM supports 64-bit wide ECC functionality to detect multi-bit errors
  and recover from a single-bit memory fault. On a write, if all bytes are
  being written, the ECC is generated and written into the ECC RAM along with
  the write-data that is written into the data RAM. If one or more bytes are
  not written, then the read operation results in a correctable or
  uncorrectable error.
properties:
  compatible:
    const: xlnx,zynqmp-ocmc-1.0

  reg:
    maxItems: 1

  interrupts:
    maxItems: 1

required:
  - compatible
  - reg
  - interrupts

additionalProperties: false
examples:
  - |
    #include <dt-bindings/interrupt-controller/arm-gic.h>

    memory-controller@ff960000 {
      compatible = "xlnx,zynqmp-ocmc-1.0";
      reg = <0xff960000 0x1000>;
      interrupts = <GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>;
    };
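As a rough, hypothetical sketch only (not the actual drivers/edac/zynqmp_edac.c, whose diff is collapsed further below; all ocm_edac_* names are made up for illustration), a platform driver typically consumes the reg and interrupts properties declared by this binding roughly as follows:

/* Hypothetical sketch, not drivers/edac/zynqmp_edac.c. */
#include <linux/err.h>
#include <linux/interrupt.h>
#include <linux/mod_devicetable.h>
#include <linux/module.h>
#include <linux/platform_device.h>

static irqreturn_t ocm_edac_irq(int irq, void *dev_id)
{
        /* A real driver would read and clear the controller's ECC error status here. */
        return IRQ_HANDLED;
}

static int ocm_edac_probe(struct platform_device *pdev)
{
        void __iomem *base;
        int irq, ret;

        /* Map the single "reg" window from the device tree node. */
        base = devm_platform_ioremap_resource(pdev, 0);
        if (IS_ERR(base))
                return PTR_ERR(base);

        /* Fetch and wire up the single "interrupts" entry. */
        irq = platform_get_irq(pdev, 0);
        if (irq < 0)
                return irq;

        ret = devm_request_irq(&pdev->dev, irq, ocm_edac_irq, 0,
                               dev_name(&pdev->dev), NULL);
        if (ret)
                return ret;

        return 0;
}

static const struct of_device_id ocm_edac_of_match[] = {
        { .compatible = "xlnx,zynqmp-ocmc-1.0" },
        { /* sentinel */ }
};
MODULE_DEVICE_TABLE(of, ocm_edac_of_match);

static struct platform_driver ocm_edac_driver = {
        .probe = ocm_edac_probe,
        .driver = {
                .name = "ocm-edac-example",
                .of_match_table = ocm_edac_of_match,
        },
};
module_platform_driver(ocm_edac_driver);

MODULE_LICENSE("GPL");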
...@@ -22743,6 +22743,13 @@ F: Documentation/devicetree/bindings/dma/xilinx/xlnx,zynqmp-dpdma.yaml
F: drivers/dma/xilinx/xilinx_dpdma.c
F: include/dt-bindings/dma/xlnx-zynqmp-dpdma.h
XILINX ZYNQMP OCM EDAC DRIVER
M: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
M: Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
S: Maintained
F: Documentation/devicetree/bindings/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml
F: drivers/edac/zynqmp_edac.c
XILINX ZYNQMP PSGTR PHY DRIVER
M: Anurag Kumar Vulisha <anurag.kumar.vulisha@xilinx.com>
M: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
...
...@@ -542,4 +542,12 @@ config EDAC_DMC520
          Support for error detection and correction on the
          SoCs with ARM DMC-520 DRAM controller.
config EDAC_ZYNQMP
        tristate "Xilinx ZynqMP OCM Controller"
        depends on ARCH_ZYNQMP || COMPILE_TEST
        help
          This driver supports error detection and correction for the
          Xilinx ZynqMP OCM (On Chip Memory) controller. It can also be
          built as a module. In that case it will be called zynqmp_edac.
endif # EDAC
...@@ -84,3 +84,4 @@ obj-$(CONFIG_EDAC_QCOM) += qcom_edac.o
obj-$(CONFIG_EDAC_ASPEED) += aspeed_edac.o
obj-$(CONFIG_EDAC_BLUEFIELD) += bluefield_edac.o
obj-$(CONFIG_EDAC_DMC520) += dmc520_edac.o
obj-$(CONFIG_EDAC_ZYNQMP) += zynqmp_edac.o
This diff is collapsed.
...@@ -114,22 +114,6 @@
#define PCI_DEVICE_ID_AMD_16H_NB_F2 0x1532
#define PCI_DEVICE_ID_AMD_16H_M30H_NB_F1 0x1581
#define PCI_DEVICE_ID_AMD_16H_M30H_NB_F2 0x1582
#define PCI_DEVICE_ID_AMD_17H_DF_F0 0x1460
#define PCI_DEVICE_ID_AMD_17H_DF_F6 0x1466
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F0 0x15e8
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F6 0x15ee
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F0 0x1490
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F6 0x1496
#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F0 0x1448
#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F6 0x144e
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F0 0x1440
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F6 0x1446
#define PCI_DEVICE_ID_AMD_19H_DF_F0 0x1650
#define PCI_DEVICE_ID_AMD_19H_DF_F6 0x1656
#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F0 0x14ad
#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F6 0x14b3
#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F0 0x166a
#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F6 0x1670
/*
 * Function 1 - Address Map
...@@ -215,8 +199,6 @@
#define DCT_SEL_HI 0x114
#define F15H_M60H_SCRCTRL 0x1C8
#define F17H_SCR_BASE_ADDR 0x48
#define F17H_SCR_LIMIT_ADDR 0x4C
/*
 * Function 3 - Misc Control
...@@ -356,7 +338,7 @@ struct amd64_pvt {
struct low_ops *ops;
/* pci_device handles which we utilize */
struct pci_dev *F0, *F1, *F2, *F3, *F6;
struct pci_dev *F1, *F2, *F3;
u16 mc_node_id; /* MC index of this MC node */
u8 fam; /* CPU family */
...@@ -364,7 +346,6 @@ struct amd64_pvt {
u8 stepping; /* ... stepping */
int ext_model; /* extended model value of this node */
int channel_count;
/* Raw registers */
u32 dclr0; /* DRAM Configuration Low DCT0 reg */
...@@ -484,7 +465,6 @@ struct ecc_settings {
 * functions and per device encoding/decoding logic.
 */
struct low_ops {
int (*early_channel_count) (struct amd64_pvt *pvt);
void (*map_sysaddr_to_csrow) (struct mem_ctl_info *mci, u64 sys_addr,
struct err_info *);
int (*dbam_to_cs) (struct amd64_pvt *pvt, u8 dct,
...@@ -503,7 +483,7 @@ struct amd64_family_flags {
struct amd64_family_type {
const char *ctl_name;
u16 f0_id, f1_id, f2_id, f6_id;
u16 f1_id, f2_id;
/* Maximum number of memory controllers per die/node. */
u8 max_mcs;
struct amd64_family_flags flags;
...
This diff is collapsed.
...@@ -396,12 +396,19 @@ static int qcom_llcc_edac_remove(struct platform_device *pdev)
return 0;
}
static const struct platform_device_id qcom_llcc_edac_id_table[] = {
{ .name = "qcom_llcc_edac" },
{}
};
MODULE_DEVICE_TABLE(platform, qcom_llcc_edac_id_table);
static struct platform_driver qcom_llcc_edac_driver = {
.probe = qcom_llcc_edac_probe,
.remove = qcom_llcc_edac_remove,
.driver = {
.name = "qcom_llcc_edac",
},
.id_table = qcom_llcc_edac_id_table,
};
module_platform_driver(qcom_llcc_edac_driver);
...
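For context on the qcom_edac hunk above: a platform_device_id table exported with MODULE_DEVICE_TABLE() is what gives the module a "platform:<name>" alias that udev can use to autoload it. A minimal, generic sketch of the pattern (the example-edac name is made up; this is not the qcom driver):

/* Generic autoloading sketch -- not the qcom driver. */
#include <linux/mod_devicetable.h>
#include <linux/module.h>
#include <linux/platform_device.h>

static int example_edac_probe(struct platform_device *pdev)
{
        return 0;
}

static const struct platform_device_id example_edac_id_table[] = {
        { .name = "example-edac" },
        { /* sentinel */ }
};
/* Emits a "platform:example-edac" alias into the module, so udev can
 * modprobe it when a matching platform device is created. */
MODULE_DEVICE_TABLE(platform, example_edac_id_table);

static struct platform_driver example_edac_driver = {
        .probe = example_edac_probe,
        .driver = {
                .name = "example-edac",
        },
        .id_table = example_edac_id_table,
};
module_platform_driver(example_edac_driver);

MODULE_LICENSE("GPL");

The change above applies the same pattern to qcom_llcc_edac, so the module can be loaded automatically when the parent LLCC driver creates its qcom_llcc_edac platform device.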
...@@ -560,44 +560,28 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
                tp_event = HW_EVENT_ERR_CORRECTED;
        }
        /*
         * According to Intel Architecture spec vol 3B,
         * Table 15-10 "IA32_MCi_Status [15:0] Compound Error Code Encoding"
         * memory errors should fit one of these masks:
         *      000f 0000 1mmm cccc (binary)
         *      000f 0010 1mmm cccc (binary)    [RAM used as cache]
         * where:
         *      f = Correction Report Filtering Bit. If 1, subsequent errors
         *          won't be shown
         *      mmm = error type
         *      cccc = channel
         * If the mask doesn't match, report an error to the parsing logic
         */
        if (!((errcode & 0xef80) == 0x80 || (errcode & 0xef80) == 0x280)) {
                optype = "Can't parse: it is not a mem";
        } else {
        switch (optypenum) {
        case 0:
                optype = "generic undef request error";
                break;
        case 1:
                optype = "memory read error";
                break;
        case 2:
                optype = "memory write error";
                break;
        case 3:
                optype = "addr/cmd error";
                break;
        case 4:
                optype = "memory scrubbing error";
                scrub_err = true;
                break;
        default:
                optype = "reserved";
                break;
        }
        }
        if (res->decoded_by_adxl) {
                len = snprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s",
                               overflow ? " OVERFLOW" : "",
...@@ -632,12 +616,18 @@ static bool skx_error_in_1st_level_mem(const struct mce *m)
        if (!skx_mem_cfg_2lm)
                return false;
        errcode = GET_BITFIELD(m->status, 0, 15);
        if ((errcode & 0xef80) != 0x280)
                return false;
        return true;
        errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
        return errcode == MCACOD_EXT_MEM_ERR;
}

static bool skx_error_in_mem(const struct mce *m)
{
        u32 errcode;

        errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;

        return (errcode == MCACOD_MEM_CTL_ERR || errcode == MCACOD_EXT_MEM_ERR);
}
int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
...@@ -651,8 +641,8 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
        if (mce->kflags & MCE_HANDLED_CEC)
                return NOTIFY_DONE;

        /* Ignore unless this is memory related with an address */
        if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV))
        if (!skx_error_in_mem(mce) || !(mce->status & MCI_STATUS_ADDRV))
                return NOTIFY_DONE;

        memset(&res, 0, sizeof(res));
...
...@@ -33,7 +33,7 @@
#define SKX_NUM_CHANNELS 3 /* Channels per memory controller */
#define SKX_NUM_DIMMS 2 /* Max DIMMS per channel */
#define I10NM_NUM_DDR_IMC 4
#define I10NM_NUM_DDR_IMC 12
#define I10NM_NUM_DDR_CHANNELS 2
#define I10NM_NUM_DDR_DIMMS 2
...@@ -56,6 +56,30 @@
#define MCI_MISC_ECC_MODE(m) (((m) >> 59) & 15)
#define MCI_MISC_ECC_DDRT 8 /* read from DDRT */
/*
* According to Intel Architecture spec vol 3B,
* Table 15-10 "IA32_MCi_Status [15:0] Compound Error Code Encoding"
* memory errors should fit one of these masks:
* 000f 0000 1mmm cccc (binary)
* 000f 0010 1mmm cccc (binary) [RAM used as cache]
* where:
* f = Correction Report Filtering Bit. If 1, subsequent errors
* won't be shown
* mmm = error type
* cccc = channel
*/
#define MCACOD_MEM_ERR_MASK 0xef80
/*
* Errors from either the memory of the 1-level memory system or the
* 2nd level memory (the slow "far" memory) of the 2-level memory system.
*/
#define MCACOD_MEM_CTL_ERR 0x80
/*
* Errors from the 1st level memory (the fast "near" memory as cache)
* of the 2-level memory system.
*/
#define MCACOD_EXT_MEM_ERR 0x280
/*
 * Each cpu socket contains some pci devices that provide global
 * information, and also some that are local to each of the two
...@@ -105,7 +129,8 @@ struct skx_pvt {
enum type {
SKX,
I10NM,
SPR,
GNR
};
enum {
...@@ -149,19 +174,47 @@ struct decoded_addr {
bool decoded_by_adxl;
};
struct pci_bdf {
u32 bus : 8;
u32 dev : 5;
u32 fun : 3;
};
struct res_config {
enum type type;
/* Configuration agent device ID */
unsigned int decs_did;
/* Default bus number configuration register offset */
int busno_cfg_offset;
/* DDR memory controllers per socket */
int ddr_imc_num;
/* DDR channels per DDR memory controller */
int ddr_chan_num;
/* DDR DIMMs per DDR memory channel */
int ddr_dimm_num;
/* Per DDR channel memory-mapped I/O size */
int ddr_chan_mmio_sz;
/* HBM memory controllers per socket */
int hbm_imc_num;
/* HBM channels per HBM memory controller */
int hbm_chan_num;
/* HBM DIMMs per HBM memory channel */
int hbm_dimm_num;
/* Per HBM channel memory-mapped I/O size */
int hbm_chan_mmio_sz;
bool support_ddr5;
/* SAD device number and function number */
unsigned int sad_all_devfn;
/* SAD device BDF */
struct pci_bdf sad_all_bdf;
/* PCU device BDF */
struct pci_bdf pcu_cr3_bdf;
/* UTIL device BDF */
struct pci_bdf util_all_bdf;
/* URACU device BDF */
struct pci_bdf uracu_bdf;
/* DDR mdev device BDF */
struct pci_bdf ddr_mdev_bdf;
/* HBM mdev device BDF */
struct pci_bdf hbm_mdev_bdf;
int sad_all_offset;
/* Offsets of retry_rd_err_log registers */
u32 *offsets_scrub;
...
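A worked example may help with the MCACOD_* masks added to skx_common.h above, which drive the new skx_error_in_mem() and skx_error_in_1st_level_mem() checks. The following standalone userspace sketch mirrors the same bit test; the sample status values are made up for the walk-through:

/* Illustrative only: mirrors the MCACOD classification; sample values are made up. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MCACOD_MEM_ERR_MASK 0xef80
#define MCACOD_MEM_CTL_ERR  0x80   /* 1-level memory, or the "far" memory in 2LM */
#define MCACOD_EXT_MEM_ERR  0x280  /* the "near" memory (RAM used as cache) in 2LM */

static bool error_in_mem(uint64_t mci_status)
{
        uint16_t errcode = mci_status & 0xffff;   /* MCACOD is bits [15:0] */

        errcode &= MCACOD_MEM_ERR_MASK;           /* ignore the f, mmm and cccc fields */
        return errcode == MCACOD_MEM_CTL_ERR || errcode == MCACOD_EXT_MEM_ERR;
}

int main(void)
{
        /* 0x009f -> masked 0x0080: memory controller (far/1LM) error
         * 0x0281 -> masked 0x0280: near-memory (RAM-as-cache) error
         * 0x0005 -> masked 0x0000: not a memory error */
        uint64_t samples[] = { 0x009f, 0x0281, 0x0005 };

        for (int i = 0; i < 3; i++)
                printf("status 0x%04llx -> %s\n",
                       (unsigned long long)samples[i],
                       error_in_mem(samples[i]) ? "memory error" : "not a memory error");

        return 0;
}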
This diff is collapsed.