Commit 4275be63 authored by Mauro Carvalho Chehab

edac: Change internal representation to work with layers

Change the EDAC internal representation to work with non-csrow
based memory controllers.

There are lots of those memory controllers nowadays, and more
are coming. So, the EDAC internal representation needs to be
changed, in order to work with those memory controllers, while
preserving backward compatibility with the old ones.

The edac core was written with the idea that memory controllers
are able to directly access csrows.

This is not true for FB-DIMM and RAMBUS memory controllers.

Also, some recent advanced memory controllers don't present a per-csrow
view. They view memories as DIMMs rather than as ranks.

So, change the allocation and error report routines to allow
them to work with all types of architectures.

This will allow the removal of several hacks with FB-DIMM and RAMBUS
memory controllers.

Also, several tests were done on different platforms using different
x86 drivers.

TODO: multi-rank DIMMs are currently represented by multiple DIMM
entries in struct dimm_info. That means that changing a label for one
rank won't change the same label for the other ranks of the same DIMM.
This bug has been present since the beginning of EDAC, so it is not a big
deal. However, on several drivers, it is possible to fix this issue, but
it should be a per-driver fix, as the csrow => DIMM arrangement may not
be equal for all. So, don't try to fix it here yet.

I tried to make this patch as short as possible, preceding it with
several other patches that simplified the logic here. Yet, as the
internal API changes, all drivers need changes. The changes are
generally bigger in the drivers for FB-DIMMs.

Cc: Aristeu Rozanski <arozansk@redhat.com>
Cc: Doug Thompson <norsk5@yahoo.com>
Cc: Borislav Petkov <borislav.petkov@amd.com>
Cc: Mark Gross <mark.gross@intel.com>
Cc: Jason Uhlenkott <juhlenko@akamai.com>
Cc: Tim Small <tim@buttersideup.com>
Cc: Ranganathan Desikan <ravi@jetztechnologies.com>
Cc: "Arvind R." <arvino55@gmail.com>
Cc: Olof Johansson <olof@lixom.net>
Cc: Egor Martovetsky <egor@pasemi.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Michal Marek <mmarek@suse.cz>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Joe Perches <joe@perches.com>
Cc: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Hitoshi Mitake <h.mitake@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "Niklas Söderlund" <niklas.soderlund@ericsson.com>
Cc: Shaohui Xie <Shaohui.Xie@freescale.com>
Cc: Josh Boyer <jwboyer@gmail.com>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
parent 982216a4
......@@ -447,8 +447,12 @@ static inline void pci_write_bits32(struct pci_dev *pdev, int offset,
#endif /* CONFIG_PCI */
extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
unsigned nr_chans, int edac_index);
struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
unsigned nr_chans, int edac_index);
struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
unsigned n_layers,
struct edac_mc_layer *layers,
unsigned sz_pvt);
extern int edac_mc_add_mc(struct mem_ctl_info *mci);
extern void edac_mc_free(struct mem_ctl_info *mci);
extern struct mem_ctl_info *edac_mc_find(int idx);
......@@ -467,24 +471,78 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
* reporting logic and function interface - reduces conditional
* statement clutter and extra function arguments.
*/
extern void edac_mc_handle_ce(struct mem_ctl_info *mci,
unsigned long page_frame_number,
unsigned long offset_in_page,
unsigned long syndrome, int row, int channel,
const char *msg);
extern void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci,
const char *msg);
extern void edac_mc_handle_ue(struct mem_ctl_info *mci,
unsigned long page_frame_number,
unsigned long offset_in_page, int row,
const char *msg);
extern void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci,
const char *msg);
extern void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci, unsigned int csrow,
unsigned int channel0, unsigned int channel1,
char *msg);
extern void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci, unsigned int csrow,
unsigned int channel, char *msg);
/*
 * edac_mc_handle_error() - single entry point for reporting a memory error.
 *
 * Only the prototype is visible here; the definition lives elsewhere.
 * The notes below describe how the compatibility wrappers in this header
 * call it, not the full contract of the implementation.
 *
 * @type:              error class; the wrappers pass HW_EVENT_ERR_CORRECTED
 *                     or HW_EVENT_ERR_UNCORRECTED.
 * @mci:               memory controller the error is reported against.
 * @page_frame_number: page frame of the error; the *_no_info and fbd
 *                     wrappers pass 0.
 * @offset_in_page:    offset inside that page; 0 in the same wrappers.
 * @syndrome:          error syndrome; only edac_mc_handle_ce() forwards a
 *                     caller-supplied value, the other wrappers pass 0.
 * @layer0:            position in the first layer; the wrappers pass the
 *                     legacy row/csrow here, or -1 when they have none.
 * @layer1:            position in the second layer; the wrappers pass the
 *                     legacy channel here, or -1 when they have none.
 * @layer2:            position in the third layer; every wrapper passes -1.
 * @msg:               driver-supplied message string.
 * @other_detail:      extra detail string; every wrapper passes NULL.
 * @mcelog:            opaque pointer; every wrapper passes NULL.
 */
void edac_mc_handle_error(const enum hw_event_mc_err_type type,
struct mem_ctl_info *mci,
const unsigned long page_frame_number,
const unsigned long offset_in_page,
const unsigned long syndrome,
const int layer0,
const int layer1,
const int layer2,
const char *msg,
const char *other_detail,
const void *mcelog);
/*
 * Legacy correctable-error helper, kept as a thin shim over
 * edac_mc_handle_error().
 *
 * @row and @channel are forwarded as layer0 and layer1; layer2 is passed
 * as -1 and both trailing pointer arguments are NULL.
 */
static inline void edac_mc_handle_ce(struct mem_ctl_info *mci,
				     unsigned long page_frame_number,
				     unsigned long offset_in_page,
				     unsigned long syndrome, int row, int channel,
				     const char *msg)
{
	const int unset = -1;	/* value handed over for the third layer */

	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED,
			     mci,
			     page_frame_number,
			     offset_in_page,
			     syndrome,
			     row, channel, unset,
			     msg,
			     NULL, NULL);
}
/*
 * Legacy helper for a correctable error that carries no location data.
 *
 * Forwards to edac_mc_handle_error() with page, offset and syndrome all
 * zero, -1 for every layer, and NULL for both trailing pointer arguments.
 */
static inline void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci,
					     const char *msg)
{
	const int unset = -1;	/* same value is used for all three layers */

	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED,
			     mci,
			     0, 0, 0,
			     unset, unset, unset,
			     msg,
			     NULL, NULL);
}
/*
 * Legacy uncorrectable-error helper, kept as a thin shim over
 * edac_mc_handle_error().
 *
 * Only @row is known to this interface: it is forwarded as layer0 while
 * layer1 and layer2 are passed as -1. The syndrome is passed as 0 and
 * both trailing pointer arguments are NULL.
 */
static inline void edac_mc_handle_ue(struct mem_ctl_info *mci,
				     unsigned long page_frame_number,
				     unsigned long offset_in_page, int row,
				     const char *msg)
{
	const int unset = -1;	/* value handed over for layer1 and layer2 */

	edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED,
			     mci,
			     page_frame_number,
			     offset_in_page,
			     0,
			     row, unset, unset,
			     msg,
			     NULL, NULL);
}
/*
 * Legacy helper for an uncorrectable error that carries no location data.
 *
 * Forwards to edac_mc_handle_error() with page, offset and syndrome all
 * zero, -1 for every layer, and NULL for both trailing pointer arguments.
 */
static inline void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci,
					     const char *msg)
{
	const int unset = -1;	/* same value is used for all three layers */

	edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED,
			     mci,
			     0, 0, 0,
			     unset, unset, unset,
			     msg,
			     NULL, NULL);
}
/*
 * Legacy FB-DIMM uncorrectable-error helper, kept as a thin shim over
 * edac_mc_handle_error().
 *
 * @csrow and @channel0 are forwarded as layer0 and layer1; layer2 is
 * passed as -1. Page, offset and syndrome are all zero and both trailing
 * pointer arguments are NULL. @channel1 is accepted but not forwarded.
 */
static inline void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci,
					 unsigned int csrow,
					 unsigned int channel0,
					 unsigned int channel1,
					 char *msg)
{
	/*
	 * FIXME: the error can also be at channel1 (e.g. at the second
	 *	  channel of the same branch), yet only channel0 is reported.
	 *	  The fix is to push the edac_mc_handle_error() call into
	 *	  each driver.
	 */
	edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED,
			     mci,
			     0, 0, 0,
			     csrow, channel0, -1,
			     msg,
			     NULL, NULL);
}
/*
 * Legacy FB-DIMM correctable-error helper, kept as a thin shim over
 * edac_mc_handle_error().
 *
 * @csrow and @channel are forwarded as layer0 and layer1; layer2 is
 * passed as -1. Page, offset and syndrome are all zero and both trailing
 * pointer arguments are NULL.
 */
static inline void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci,
					 unsigned int csrow,
					 unsigned int channel, char *msg)
{
	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED,
			     mci,
			     0, 0, 0,
			     csrow, channel, -1,
			     msg,
			     NULL, NULL);
}
/*
* edac_device APIs
......@@ -496,6 +554,7 @@ extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
int inst_nr, int block_nr, const char *msg);
extern int edac_device_alloc_index(void);
extern const char *edac_layer_name[];
/*
* edac_pci APIs
......
This diff is collapsed.
......@@ -416,18 +416,20 @@ struct edac_mc_layer {
/* FIXME: add the proper per-location error counts */
/*
 * Per-DIMM descriptor: label, location, parent controller, memory
 * attributes, size in pages and a correctable-error counter.
 *
 * NOTE(review): this text comes from a diff view whose +/- markers were
 * lost. `nr_pages` and `csrow` are each declared twice below, which looks
 * like the pre-change and post-change lines shown together - confirm
 * against the real header before relying on this layout.
 */
struct dimm_info {
char label[EDAC_MC_LABEL_LEN + 1]; /* DIMM label on motherboard */
unsigned memory_controller;
unsigned csrow;
unsigned csrow_channel;
/* Memory location data */
unsigned location[EDAC_MAX_LAYERS]; /* one slot per layer, EDAC_MAX_LAYERS at most */
struct mem_ctl_info *mci; /* the parent */
u32 grain; /* granularity of reported error in bytes */
enum dev_type dtype; /* memory device type */
enum mem_type mtype; /* memory dimm type */
enum edac_type edac_mode; /* EDAC mode for this dimm */
u32 nr_pages; /* number of pages in csrow */
u32 nr_pages; /* number of pages on this dimm */
u32 ce_count; /* Correctable Errors for this dimm */
unsigned csrow, cschannel; /* Points to the old API data */
};
/**
......@@ -447,9 +449,10 @@ struct dimm_info {
*/
/*
 * Per-channel entry that ties a channel index to its csrow and to the
 * dimm_info describing the memory behind it.
 *
 * NOTE(review): `ce_count` is declared twice below; this looks like the
 * old and new lines of a diff shown together without +/- markers -
 * confirm against the real header.
 */
struct rank_info {
int chan_idx; /* channel index of this entry */
u32 ce_count;
struct csrow_info *csrow; /* csrow this entry refers to */
struct dimm_info *dimm; /* DIMM descriptor for this entry */
u32 ce_count; /* Correctable Errors for this csrow */
};
struct csrow_info {
......@@ -545,13 +548,18 @@ struct mem_ctl_info {
unsigned long (*ctl_page_to_phys) (struct mem_ctl_info * mci,
unsigned long page);
int mc_idx;
int nr_csrows;
struct csrow_info *csrows;
unsigned nr_csrows, num_cschannel;
/* Memory Controller hierarchy */
unsigned n_layers;
struct edac_mc_layer *layers;
bool mem_is_per_rank;
/*
* DIMM info. Will eventually remove the entire csrows_info some day
*/
unsigned nr_dimms;
unsigned tot_dimms;
struct dimm_info *dimms;
/*
......@@ -566,12 +574,16 @@ struct mem_ctl_info {
const char *dev_name;
char proc_name[MC_PROC_NAME_MAX_LEN + 1];
void *pvt_info;
u32 ue_noinfo_count; /* Uncorrectable Errors w/o info */
u32 ce_noinfo_count; /* Correctable Errors w/o info */
u32 ue_count; /* Total Uncorrectable Errors for this MC */
u32 ce_count; /* Total Correctable Errors for this MC */
unsigned long start_time; /* mci load start time (in jiffies) */
/*
* drivers shouldn't access those fields directly, as the core
* already handles that.
*/
u32 ce_noinfo_count, ue_noinfo_count;
u32 ue_count, ce_count;
u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
struct completion complete;
/* edac sysfs device control */
......@@ -584,7 +596,7 @@ struct mem_ctl_info {
* by the low level driver.
*
* Set by the low level driver to provide attributes at the
* controller level, same level as 'ue_count' and 'ce_count' above.
* controller level.
* An array of structures, NULL terminated
*
* If attributes are desired, then set to array of attributes
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment