Commit aaf985e2 authored by Linus Torvalds

Merge tag 'edac_updates_for_5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras

Pull EDAC updates from Borislav Petkov:

 - A substantial edac_mc cleanup, sanitizing object freeing,
   streamlining and simplifying code flow, and getting rid of a lot of
   needless complexity in memory controller representation code, by
   Robert Richter.

 - A new EDAC driver for the ARM DMC-520 memory controller, by Lei Wang,
   Shiping Ji and others.

 - The usual sprinkling of misc cleanups and fixes all over the
   subsystem.

* tag 'edac_updates_for_5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras:
  EDAC/armada_xp: Use scnprintf() for avoiding potential buffer overflow
  EDAC/synopsys: Do not dump uninitialized pinf->col
  EDAC: Add EDAC driver for DMC520
  dt-bindings: edac: Dmc-520.yaml
  EDAC/mce_amd: Print !SMCA processor warning only once
  EDAC/mc: Remove per layer counters
  EDAC/mc: Remove detail[] string and cleanup error string generation
  EDAC/mc: Pass the error descriptor to error reporting functions
  EDAC/mc: Remove enable_per_layer_report function argument
  EDAC/mc: Report "unknown memory" on too many DIMM labels found
  EDAC/mc: Carve out error increment into a separate function
  EDAC/mc: Determine mci pointer from the error descriptor
  EDAC: Store error type in struct edac_raw_error_desc
  EDAC/mc: Reorder functions edac_mc_alloc*()
  EDAC/mc: Split edac_mc_alloc() into smaller functions
  EDAC/mc: Change mci device removal to use put_device()
parents c271bdbf 41dac9a2
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/edac/dmc-520.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: ARM DMC-520 EDAC bindings
maintainers:
  - Lei Wang <lewan@microsoft.com>
description: |+
  DMC-520 node is defined to describe DRAM error detection and correction.
  https://static.docs.arm.com/100000/0200/corelink_dmc520_trm_100000_0200_01_en.pdf
properties:
  compatible:
    items:
      - const: brcm,dmc-520
      - const: arm,dmc-520
  reg:
    maxItems: 1
  # Between one and ten interrupt specifiers may be listed.
  interrupts:
    minItems: 1
    maxItems: 10
  # Each entry must be one of the names enumerated below.
  interrupt-names:
    minItems: 1
    maxItems: 10
    items:
      enum:
        - ram_ecc_errc
        - ram_ecc_errd
        - dram_ecc_errc
        - dram_ecc_errd
        - failed_access
        - failed_prog
        - link_err
        - temperature_event
        - arch_fsm
        - phy_request
required:
  - compatible
  - reg
  - interrupts
  - interrupt-names
examples:
  - |
    dmc0: dmc@200000 {
        compatible = "brcm,dmc-520", "arm,dmc-520";
        reg = <0x200000 0x80000>;
        interrupts = <0x0 0x349 0x4>, <0x0 0x34B 0x4>;
        interrupt-names = "dram_ecc_errc", "dram_ecc_errd";
    };
......@@ -5998,6 +5998,12 @@ F: Documentation/driver-api/edac.rst
F: drivers/edac/
F: include/linux/edac.h
EDAC-DMC520
M: Lei Wang <lewan@microsoft.com>
L: linux-edac@vger.kernel.org
S: Supported
F: drivers/edac/dmc520_edac.c
EDAC-E752X
M: Mark Gross <mark.gross@intel.com>
L: linux-edac@vger.kernel.org
......
......@@ -523,4 +523,11 @@ config EDAC_BLUEFIELD
Support for error detection and correction on the
Mellanox BlueField SoCs.
config EDAC_DMC520
tristate "ARM DMC-520 ECC"
depends on ARM64
help
Support for error detection and correction on the
SoCs with ARM DMC-520 DRAM controller.
endif # EDAC
......@@ -87,3 +87,4 @@ obj-$(CONFIG_EDAC_TI) += ti_edac.o
obj-$(CONFIG_EDAC_QCOM) += qcom_edac.o
obj-$(CONFIG_EDAC_ASPEED) += aspeed_edac.o
obj-$(CONFIG_EDAC_BLUEFIELD) += bluefield_edac.o
obj-$(CONFIG_EDAC_DMC520) += dmc520_edac.o
......@@ -429,26 +429,26 @@ static void aurora_l2_check(struct edac_device_ctl_info *dci)
src = (attr_cap & AURORA_ERR_ATTR_SRC_MSK) >> AURORA_ERR_ATTR_SRC_OFF;
if (src <= 3)
len += snprintf(msg+len, size-len, "src=CPU%d ", src);
len += scnprintf(msg+len, size-len, "src=CPU%d ", src);
else
len += snprintf(msg+len, size-len, "src=IO ");
len += scnprintf(msg+len, size-len, "src=IO ");
txn = (attr_cap & AURORA_ERR_ATTR_TXN_MSK) >> AURORA_ERR_ATTR_TXN_OFF;
switch (txn) {
case 0:
len += snprintf(msg+len, size-len, "txn=Data-Read ");
len += scnprintf(msg+len, size-len, "txn=Data-Read ");
break;
case 1:
len += snprintf(msg+len, size-len, "txn=Isn-Read ");
len += scnprintf(msg+len, size-len, "txn=Isn-Read ");
break;
case 2:
len += snprintf(msg+len, size-len, "txn=Clean-Flush ");
len += scnprintf(msg+len, size-len, "txn=Clean-Flush ");
break;
case 3:
len += snprintf(msg+len, size-len, "txn=Eviction ");
len += scnprintf(msg+len, size-len, "txn=Eviction ");
break;
case 4:
len += snprintf(msg+len, size-len,
len += scnprintf(msg+len, size-len,
"txn=Read-Modify-Write ");
break;
}
......@@ -456,19 +456,19 @@ static void aurora_l2_check(struct edac_device_ctl_info *dci)
err = (attr_cap & AURORA_ERR_ATTR_ERR_MSK) >> AURORA_ERR_ATTR_ERR_OFF;
switch (err) {
case 0:
len += snprintf(msg+len, size-len, "err=CorrECC ");
len += scnprintf(msg+len, size-len, "err=CorrECC ");
break;
case 1:
len += snprintf(msg+len, size-len, "err=UnCorrECC ");
len += scnprintf(msg+len, size-len, "err=UnCorrECC ");
break;
case 2:
len += snprintf(msg+len, size-len, "err=TagParity ");
len += scnprintf(msg+len, size-len, "err=TagParity ");
break;
}
len += snprintf(msg+len, size-len, "addr=0x%x ", addr_cap & AURORA_ERR_ADDR_CAP_ADDR_MASK);
len += snprintf(msg+len, size-len, "index=0x%x ", (way_cap & AURORA_ERR_WAY_IDX_MSK) >> AURORA_ERR_WAY_IDX_OFF);
len += snprintf(msg+len, size-len, "way=0x%x", (way_cap & AURORA_ERR_WAY_CAP_WAY_MASK) >> AURORA_ERR_WAY_CAP_WAY_OFFSET);
len += scnprintf(msg+len, size-len, "addr=0x%x ", addr_cap & AURORA_ERR_ADDR_CAP_ADDR_MASK);
len += scnprintf(msg+len, size-len, "index=0x%x ", (way_cap & AURORA_ERR_WAY_IDX_MSK) >> AURORA_ERR_WAY_IDX_OFF);
len += scnprintf(msg+len, size-len, "way=0x%x", (way_cap & AURORA_ERR_WAY_CAP_WAY_MASK) >> AURORA_ERR_WAY_CAP_WAY_OFFSET);
/* clear error capture registers */
writel(AURORA_ERR_ATTR_CAP_VALID, drvdata->base + AURORA_ERR_ATTR_CAP_REG);
......
This diff is collapsed.
This diff is collapsed.
......@@ -212,17 +212,13 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
* edac_raw_mc_handle_error() - Reports a memory event to userspace without
* doing anything to discover the error location.
*
* @type: severity of the error (CE/UE/Fatal)
* @mci: a struct mem_ctl_info pointer
* @e: error description
*
* This raw function is used internally by edac_mc_handle_error(). It should
* only be called directly when the hardware error comes directly from BIOS,
* like in the case of APEI GHES driver.
*/
void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
struct mem_ctl_info *mci,
struct edac_raw_error_desc *e);
void edac_raw_mc_handle_error(struct edac_raw_error_desc *e);
/**
* edac_mc_handle_error() - Reports a memory event to userspace.
......
......@@ -274,14 +274,8 @@ static const struct attribute_group *csrow_attr_groups[] = {
NULL
};
static void csrow_attr_release(struct device *dev)
{
/* release device with _edac_mc_free() */
}
static const struct device_type csrow_attr_type = {
.groups = csrow_attr_groups,
.release = csrow_attr_release,
};
/*
......@@ -387,6 +381,14 @@ static const struct attribute_group *csrow_dev_groups[] = {
NULL
};
/*
 * csrow_release() - release callback for a csrow device.
 * @dev: the csrow device being released (unused)
 *
 * Assigned to csrow->dev.release in edac_create_csrow_object(). The body
 * is intentionally empty: this callback frees nothing itself.
 */
static void csrow_release(struct device *dev)
{
	/*
	 * Nothing to do, just unregister sysfs here. The mci
	 * device owns the data and will also release it.
	 */
}
static inline int nr_pages_per_csrow(struct csrow_info *csrow)
{
int chan, nr_pages = 0;
......@@ -405,6 +407,7 @@ static int edac_create_csrow_object(struct mem_ctl_info *mci,
csrow->dev.type = &csrow_attr_type;
csrow->dev.groups = csrow_dev_groups;
csrow->dev.release = csrow_release;
device_initialize(&csrow->dev);
csrow->dev.parent = &mci->dev;
csrow->mci = mci;
......@@ -441,10 +444,8 @@ static int edac_create_csrow_objects(struct mem_ctl_info *mci)
error:
for (--i; i >= 0; i--) {
csrow = mci->csrows[i];
if (!nr_pages_per_csrow(csrow))
continue;
device_unregister(&mci->csrows[i]->dev);
if (device_is_registered(&mci->csrows[i]->dev))
device_unregister(&mci->csrows[i]->dev);
}
return err;
......@@ -453,15 +454,13 @@ static int edac_create_csrow_objects(struct mem_ctl_info *mci)
static void edac_delete_csrow_objects(struct mem_ctl_info *mci)
{
int i;
struct csrow_info *csrow;
for (i = mci->nr_csrows - 1; i >= 0; i--) {
csrow = mci->csrows[i];
if (!nr_pages_per_csrow(csrow))
continue;
device_unregister(&mci->csrows[i]->dev);
for (i = 0; i < mci->nr_csrows; i++) {
if (device_is_registered(&mci->csrows[i]->dev))
device_unregister(&mci->csrows[i]->dev);
}
}
#endif
/*
......@@ -552,10 +551,8 @@ static ssize_t dimmdev_ce_count_show(struct device *dev,
char *data)
{
struct dimm_info *dimm = to_dimm(dev);
u32 count;
count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][dimm->idx];
return sprintf(data, "%u\n", count);
return sprintf(data, "%u\n", dimm->ce_count);
}
static ssize_t dimmdev_ue_count_show(struct device *dev,
......@@ -563,10 +560,8 @@ static ssize_t dimmdev_ue_count_show(struct device *dev,
char *data)
{
struct dimm_info *dimm = to_dimm(dev);
u32 count;
count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][dimm->idx];
return sprintf(data, "%u\n", count);
return sprintf(data, "%u\n", dimm->ue_count);
}
/* dimm/rank attribute files */
......@@ -602,16 +597,18 @@ static const struct attribute_group *dimm_attr_groups[] = {
NULL
};
static void dimm_attr_release(struct device *dev)
{
/* release device with _edac_mc_free() */
}
static const struct device_type dimm_attr_type = {
.groups = dimm_attr_groups,
.release = dimm_attr_release,
};
/*
 * dimm_release() - release callback for a DIMM device.
 * @dev: the DIMM device being released (unused)
 *
 * Assigned to dimm->dev.release in edac_create_dimm_object(). The body
 * is intentionally empty: this callback frees nothing itself.
 */
static void dimm_release(struct device *dev)
{
	/*
	 * Nothing to do, just unregister sysfs here. The mci
	 * device owns the data and will also release it.
	 */
}
/* Create a DIMM object under specified memory controller device */
static int edac_create_dimm_object(struct mem_ctl_info *mci,
struct dimm_info *dimm)
......@@ -620,6 +617,7 @@ static int edac_create_dimm_object(struct mem_ctl_info *mci,
dimm->mci = mci;
dimm->dev.type = &dimm_attr_type;
dimm->dev.release = dimm_release;
device_initialize(&dimm->dev);
dimm->dev.parent = &mci->dev;
......@@ -659,7 +657,9 @@ static ssize_t mci_reset_counters_store(struct device *dev,
const char *data, size_t count)
{
struct mem_ctl_info *mci = to_mci(dev);
int cnt, row, chan, i;
struct dimm_info *dimm;
int row, chan;
mci->ue_mc = 0;
mci->ce_mc = 0;
mci->ue_noinfo_count = 0;
......@@ -675,11 +675,9 @@ static ssize_t mci_reset_counters_store(struct device *dev,
ri->channels[chan]->ce_count = 0;
}
cnt = 1;
for (i = 0; i < mci->n_layers; i++) {
cnt *= mci->layers[i].size;
memset(mci->ce_per_layer[i], 0, cnt * sizeof(u32));
memset(mci->ue_per_layer[i], 0, cnt * sizeof(u32));
mci_for_each_dimm(mci, dimm) {
dimm->ue_count = 0;
dimm->ce_count = 0;
}
mci->start_time = jiffies;
......@@ -884,14 +882,8 @@ static const struct attribute_group *mci_attr_groups[] = {
NULL
};
static void mci_attr_release(struct device *dev)
{
/* release device with _edac_mc_free() */
}
static const struct device_type mci_attr_type = {
.groups = mci_attr_groups,
.release = mci_attr_release,
};
/*
......@@ -910,8 +902,6 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
/* get the /sys/devices/system/edac subsys reference */
mci->dev.type = &mci_attr_type;
device_initialize(&mci->dev);
mci->dev.parent = mci_pdev;
mci->dev.groups = groups;
dev_set_name(&mci->dev, "mc%d", mci->mc_idx);
......@@ -921,7 +911,7 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
err = device_add(&mci->dev);
if (err < 0) {
edac_dbg(1, "failure: create device %s\n", dev_name(&mci->dev));
put_device(&mci->dev);
/* no put_device() here, free mci with _edac_mc_free() */
return err;
}
......@@ -937,24 +927,20 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
err = edac_create_dimm_object(mci, dimm);
if (err)
goto fail_unregister_dimm;
goto fail;
}
#ifdef CONFIG_EDAC_LEGACY_SYSFS
err = edac_create_csrow_objects(mci);
if (err < 0)
goto fail_unregister_dimm;
goto fail;
#endif
edac_create_debugfs_nodes(mci);
return 0;
fail_unregister_dimm:
mci_for_each_dimm(mci, dimm) {
if (device_is_registered(&dimm->dev))
device_unregister(&dimm->dev);
}
device_unregister(&mci->dev);
fail:
edac_remove_sysfs_mci_device(mci);
return err;
}
......@@ -966,6 +952,9 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci)
{
struct dimm_info *dimm;
if (!device_is_registered(&mci->dev))
return;
edac_dbg(0, "\n");
#ifdef CONFIG_EDAC_DEBUG
......@@ -976,17 +965,14 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci)
#endif
mci_for_each_dimm(mci, dimm) {
if (dimm->nr_pages == 0)
if (!device_is_registered(&dimm->dev))
continue;
edac_dbg(1, "unregistering device %s\n", dev_name(&dimm->dev));
device_unregister(&dimm->dev);
}
}
void edac_unregister_sysfs(struct mem_ctl_info *mci)
{
edac_dbg(1, "unregistering device %s\n", dev_name(&mci->dev));
device_unregister(&mci->dev);
/* only remove the device, but keep mci */
device_del(&mci->dev);
}
static void mc_attr_release(struct device *dev)
......@@ -1000,9 +986,6 @@ static void mc_attr_release(struct device *dev)
kfree(dev);
}
static const struct device_type mc_attr_type = {
.release = mc_attr_release,
};
/*
* Init/exit code for the module. Basically, creates/removes /sys/class/rc
*/
......@@ -1015,11 +998,10 @@ int __init edac_mc_sysfs_init(void)
return -ENOMEM;
mci_pdev->bus = edac_get_sysfs_subsys();
mci_pdev->type = &mc_attr_type;
device_initialize(mci_pdev);
dev_set_name(mci_pdev, "mc");
mci_pdev->release = mc_attr_release;
mci_pdev->init_name = "mc";
err = device_add(mci_pdev);
err = device_register(mci_pdev);
if (err < 0) {
edac_dbg(1, "failure: create device %s\n", dev_name(mci_pdev));
put_device(mci_pdev);
......
......@@ -28,7 +28,6 @@ void edac_mc_sysfs_exit(void);
extern int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
const struct attribute_group **groups);
extern void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci);
void edac_unregister_sysfs(struct mem_ctl_info *mci);
extern int edac_get_log_ue(void);
extern int edac_get_log_ce(void);
extern int edac_get_panic_on_ue(void);
......
......@@ -201,7 +201,6 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
{
enum hw_event_mc_err_type type;
struct edac_raw_error_desc *e;
struct mem_ctl_info *mci;
struct ghes_edac_pvt *pvt;
......@@ -240,17 +239,17 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
switch (sev) {
case GHES_SEV_CORRECTED:
type = HW_EVENT_ERR_CORRECTED;
e->type = HW_EVENT_ERR_CORRECTED;
break;
case GHES_SEV_RECOVERABLE:
type = HW_EVENT_ERR_UNCORRECTED;
e->type = HW_EVENT_ERR_UNCORRECTED;
break;
case GHES_SEV_PANIC:
type = HW_EVENT_ERR_FATAL;
e->type = HW_EVENT_ERR_FATAL;
break;
default:
case GHES_SEV_NO:
type = HW_EVENT_ERR_INFO;
e->type = HW_EVENT_ERR_INFO;
}
edac_dbg(1, "error validation_bits: 0x%08llx\n",
......@@ -356,11 +355,8 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
mem_err->mem_dev_handle);
index = get_dimm_smbios_index(mci, mem_err->mem_dev_handle);
if (index >= 0) {
if (index >= 0)
e->top_layer = index;
e->enable_per_layer_report = true;
}
}
if (p > e->location)
*(p - 1) = '\0';
......@@ -442,7 +438,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
if (p > pvt->other_detail)
*(p - 1) = '\0';
edac_raw_mc_handle_error(type, mci, e);
edac_raw_mc_handle_error(e);
unlock:
spin_unlock_irqrestore(&ghes_lock, flags);
......
......@@ -1239,7 +1239,7 @@ static int __init mce_amd_init(void)
case 0x17:
case 0x18:
pr_warn("Decoding supported only on Scalable MCA processors.\n");
pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
return -EINVAL;
default:
......
......@@ -477,16 +477,16 @@ static void handle_error(struct mem_ctl_info *mci, struct synps_ecc_status *p)
if (p->ce_cnt) {
pinf = &p->ceinfo;
if (!priv->p_data->quirks) {
if (priv->p_data->quirks & DDR_ECC_INTR_SUPPORT) {
snprintf(priv->message, SYNPS_EDAC_MSG_SIZE,
"DDR ECC error type:%s Row %d Bank %d Col %d Bit Position: %d Data: 0x%08x",
"CE", pinf->row, pinf->bank, pinf->col,
"DDR ECC error type:%s Row %d Bank %d BankGroup Number %d Block Number %d Bit Position: %d Data: 0x%08x",
"CE", pinf->row, pinf->bank,
pinf->bankgrpnr, pinf->blknr,
pinf->bitpos, pinf->data);
} else {
snprintf(priv->message, SYNPS_EDAC_MSG_SIZE,
"DDR ECC error type:%s Row %d Bank %d Col %d BankGroup Number %d Block Number %d Bit Position: %d Data: 0x%08x",
"DDR ECC error type:%s Row %d Bank %d Col %d Bit Position: %d Data: 0x%08x",
"CE", pinf->row, pinf->bank, pinf->col,
pinf->bankgrpnr, pinf->blknr,
pinf->bitpos, pinf->data);
}
......@@ -497,15 +497,15 @@ static void handle_error(struct mem_ctl_info *mci, struct synps_ecc_status *p)
if (p->ue_cnt) {
pinf = &p->ueinfo;
if (!priv->p_data->quirks) {
if (priv->p_data->quirks & DDR_ECC_INTR_SUPPORT) {
snprintf(priv->message, SYNPS_EDAC_MSG_SIZE,
"DDR ECC error type :%s Row %d Bank %d Col %d ",
"UE", pinf->row, pinf->bank, pinf->col);
"DDR ECC error type :%s Row %d Bank %d BankGroup Number %d Block Number %d",
"UE", pinf->row, pinf->bank,
pinf->bankgrpnr, pinf->blknr);
} else {
snprintf(priv->message, SYNPS_EDAC_MSG_SIZE,
"DDR ECC error type :%s Row %d Bank %d Col %d BankGroup Number %d Block Number %d",
"UE", pinf->row, pinf->bank, pinf->col,
pinf->bankgrpnr, pinf->blknr);
"DDR ECC error type :%s Row %d Bank %d Col %d ",
"UE", pinf->row, pinf->bank, pinf->col);
}
edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
......
......@@ -383,6 +383,9 @@ struct dimm_info {
unsigned int csrow, cschannel; /* Points to the old API data */
u16 smbios_handle; /* Handle for SMBIOS type 17 */
u32 ce_count;
u32 ue_count;
};
/**
......@@ -442,6 +445,7 @@ struct errcount_attribute_data {
* struct edac_raw_error_desc - Raw error report structure
* @grain: minimum granularity for an error report, in bytes
* @error_count: number of errors of the same type
* @type: severity of the error (CE/UE/Fatal)
* @top_layer: top layer of the error (layer[0])
* @mid_layer: middle layer of the error (layer[1])
* @low_layer: low layer of the error (layer[2])
......@@ -453,8 +457,6 @@ struct errcount_attribute_data {
* @location: location of the error
* @label: label of the affected DIMM(s)
* @other_detail: other driver-specific detail about the error
* @enable_per_layer_report: if false, the error affects all layers
* (typically, a memory controller error)
*/
struct edac_raw_error_desc {
char location[LOCATION_SIZE];
......@@ -462,6 +464,7 @@ struct edac_raw_error_desc {
long grain;
u16 error_count;
enum hw_event_mc_err_type type;
int top_layer;
int mid_layer;
int low_layer;
......@@ -470,7 +473,6 @@ struct edac_raw_error_desc {
unsigned long syndrome;
const char *msg;
const char *other_detail;
bool enable_per_layer_report;
};
/* MEMORY controller information structure
......@@ -560,7 +562,6 @@ struct mem_ctl_info {
*/
u32 ce_noinfo_count, ue_noinfo_count;
u32 ue_mc, ce_mc;
u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
struct completion complete;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment