Commit 1cc3880a authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'edac_for_4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp

Pull EDAC updates from Borislav Petkov:
 "It was pretty busy in EDAC land this time:

   - Altera Arria10 L2 cache and On-Chip RAM ECC handling (Thor Thayer)

   - Remove ad-hoc buffering of MCE records in sb_edac and i7core_edac
     (Tony Luck)

   - Do not register sb_edac with pci_register_driver() (Tony Luck)

   - Add support for Skylake to ie31200_edac (Jason Baron)

   - Do not register amd64_edac with pci_register_driver() (Borislav
     Petkov)

  ... plus the usual round of cleanups and fixes all over the place"

* tag 'edac_for_4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp: (25 commits)
  EDAC, amd64_edac: Drop pci_register_driver() use
  EDAC, ie31200_edac: Add Skylake support
  EDAC, sb_edac: Use cpu family/model in driver detection
  EDAC, i7core: Remove double buffering of error records
  EDAC, amd64_edac: Issue driver banner only on success
  ARM: socfpga: Initialize Arria10 OCRAM ECC on startup
  EDAC: Increment correct counter in edac_inc_ue_error()
  EDAC, sb_edac: Remove double buffering of error records
  EDAC: Fix used after kfree() error in edac_unregister_sysfs()
  EDAC, altera: Avoid unused function warnings
  EDAC, altera: Remove useless casts
  ARM: socfpga: Enable Arria10 OCRAM ECC on startup
  EDAC, altera: Add Arria10 OCRAM ECC support
  Documentation: dt: socfpga: Add Altera Arria10 OCRAM binding
  EDAC, altera: Make OCRAM ECC dependency check generic
  EDAC, altera: Add register offset for ECC Enable
  EDAC, altera: Extract error inject operations to a struct fops
  ARM: socfpga: Enable Arria10 L2 cache ECC on startup
  EDAC, altera: Add Arria10 L2 Cache ECC handling
  Documentation, dt, socfpga: Add Altera Arria10 L2 cache binding
  ...
parents 490e1422 3f37a36b
......@@ -3,6 +3,7 @@ This driver uses the EDAC framework to implement the SOCFPGA ECC Manager.
The ECC Manager counts and corrects single bit errors and counts/handles
double bit errors which are uncorrectable.
Cyclone5 and Arria5 ECC Manager
Required Properties:
- compatible : Should be "altr,socfpga-ecc-manager"
- #address-cells: must be 1
......@@ -47,3 +48,52 @@ Example:
interrupts = <0 178 1>, <0 179 1>;
};
};
Arria10 SoCFPGA ECC Manager
The Arria10 SoC ECC Manager handles the IRQs for each peripheral
in a shared register instead of individual IRQs like the Cyclone5
and Arria5. Therefore the device tree is different as well.
Required Properties:
- compatible : Should be "altr,socfpga-a10-ecc-manager"
- altr,sysgr-syscon : phandle to Arria10 System Manager Block
containing the ECC manager registers.
- #address-cells: must be 1
- #size-cells: must be 1
- interrupts : Should be single bit error interrupt, then double bit error
interrupt. Note the rising edge type.
- ranges : standard definition, should translate from local addresses
Subcomponents:
L2 Cache ECC
Required Properties:
- compatible : Should be "altr,socfpga-a10-l2-ecc"
- reg : Address and size for ECC error interrupt clear registers.
On-Chip RAM ECC
Required Properties:
- compatible : Should be "altr,socfpga-a10-ocram-ecc"
- reg : Address and size for ECC block registers.
Example:
eccmgr: eccmgr@ffd06000 {
compatible = "altr,socfpga-a10-ecc-manager";
altr,sysmgr-syscon = <&sysmgr>;
#address-cells = <1>;
#size-cells = <1>;
interrupts = <0 2 IRQ_TYPE_LEVEL_HIGH>,
<0 0 IRQ_TYPE_LEVEL_HIGH>;
ranges;
l2-ecc@ffd06010 {
compatible = "altr,socfpga-a10-l2-ecc";
reg = <0xffd06010 0x4>;
};
ocram-ecc@ff8c3000 {
compatible = "altr,socfpga-a10-ocram-ecc";
reg = <0xff8c3000 0x90>;
};
};
......@@ -38,6 +38,8 @@ extern void socfpga_init_clocks(void);
extern void socfpga_sysmgr_init(void);
void socfpga_init_l2_ecc(void);
void socfpga_init_ocram_ecc(void);
void socfpga_init_arria10_l2_ecc(void);
void socfpga_init_arria10_ocram_ecc(void);
extern void __iomem *sys_manager_base_addr;
extern void __iomem *rst_manager_base_addr;
......
......@@ -17,6 +17,20 @@
#include <linux/of_platform.h>
#include <linux/of_address.h>
#include "core.h"
/* A10 System Manager L2 ECC Control register */
#define A10_MPU_CTRL_L2_ECC_OFST 0x0
#define A10_MPU_CTRL_L2_ECC_EN BIT(0)
/* A10 System Manager Global IRQ Mask register */
#define A10_SYSMGR_ECC_INTMASK_CLR_OFST 0x98
#define A10_SYSMGR_ECC_INTMASK_CLR_L2 BIT(0)
/* A10 System Manager L2 ECC IRQ Clear register */
#define A10_SYSMGR_MPU_CLEAR_L2_ECC_OFST 0xA8
#define A10_SYSMGR_MPU_CLEAR_L2_ECC (BIT(31) | BIT(15))
void socfpga_init_l2_ecc(void)
{
struct device_node *np;
......@@ -39,3 +53,38 @@ void socfpga_init_l2_ecc(void)
writel(0x01, mapped_l2_edac_addr);
iounmap(mapped_l2_edac_addr);
}
void socfpga_init_arria10_l2_ecc(void)
{
struct device_node *np;
void __iomem *mapped_l2_edac_addr;
/* Find the L2 EDAC device tree node */
np = of_find_compatible_node(NULL, NULL, "altr,socfpga-a10-l2-ecc");
if (!np) {
pr_err("Unable to find socfpga-a10-l2-ecc in dtb\n");
return;
}
mapped_l2_edac_addr = of_iomap(np, 0);
of_node_put(np);
if (!mapped_l2_edac_addr) {
pr_err("Unable to find L2 ECC mapping in dtb\n");
return;
}
if (!sys_manager_base_addr) {
pr_err("System Mananger not mapped for L2 ECC\n");
goto exit;
}
/* Clear any pending IRQs */
writel(A10_SYSMGR_MPU_CLEAR_L2_ECC, (sys_manager_base_addr +
A10_SYSMGR_MPU_CLEAR_L2_ECC_OFST));
/* Enable ECC */
writel(A10_SYSMGR_ECC_INTMASK_CLR_L2, sys_manager_base_addr +
A10_SYSMGR_ECC_INTMASK_CLR_OFST);
writel(A10_MPU_CTRL_L2_ECC_EN, mapped_l2_edac_addr +
A10_MPU_CTRL_L2_ECC_OFST);
exit:
iounmap(mapped_l2_edac_addr);
}
......@@ -13,12 +13,15 @@
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/delay.h>
#include <linux/io.h>
#include <linux/genalloc.h>
#include <linux/module.h>
#include <linux/of_address.h>
#include <linux/of_platform.h>
#include "core.h"
#define ALTR_OCRAM_CLEAR_ECC 0x00000018
#define ALTR_OCRAM_ECC_EN 0x00000019
......@@ -47,3 +50,133 @@ void socfpga_init_ocram_ecc(void)
iounmap(mapped_ocr_edac_addr);
}
/* Arria10 OCRAM Section */
#define ALTR_A10_ECC_CTRL_OFST 0x08
#define ALTR_A10_OCRAM_ECC_EN_CTL (BIT(1) | BIT(0))
#define ALTR_A10_ECC_INITA BIT(16)
#define ALTR_A10_ECC_INITSTAT_OFST 0x0C
#define ALTR_A10_ECC_INITCOMPLETEA BIT(0)
#define ALTR_A10_ECC_INITCOMPLETEB BIT(8)
#define ALTR_A10_ECC_ERRINTEN_OFST 0x10
#define ALTR_A10_ECC_SERRINTEN BIT(0)
#define ALTR_A10_ECC_INTSTAT_OFST 0x20
#define ALTR_A10_ECC_SERRPENA BIT(0)
#define ALTR_A10_ECC_DERRPENA BIT(8)
#define ALTR_A10_ECC_ERRPENA_MASK (ALTR_A10_ECC_SERRPENA | \
ALTR_A10_ECC_DERRPENA)
/* ECC Manager Defines */
#define A10_SYSMGR_ECC_INTMASK_SET_OFST 0x94
#define A10_SYSMGR_ECC_INTMASK_CLR_OFST 0x98
#define A10_SYSMGR_ECC_INTMASK_OCRAM BIT(1)
#define ALTR_A10_ECC_INIT_WATCHDOG_10US 10000
static inline void ecc_set_bits(u32 bit_mask, void __iomem *ioaddr)
{
u32 value = readl(ioaddr);
value |= bit_mask;
writel(value, ioaddr);
}
static inline void ecc_clear_bits(u32 bit_mask, void __iomem *ioaddr)
{
u32 value = readl(ioaddr);
value &= ~bit_mask;
writel(value, ioaddr);
}
static inline int ecc_test_bits(u32 bit_mask, void __iomem *ioaddr)
{
u32 value = readl(ioaddr);
return (value & bit_mask) ? 1 : 0;
}
/*
* This function uses the memory initialization block in the Arria10 ECC
* controller to initialize/clear the entire memory data and ECC data.
*/
static int altr_init_memory_port(void __iomem *ioaddr)
{
int limit = ALTR_A10_ECC_INIT_WATCHDOG_10US;
ecc_set_bits(ALTR_A10_ECC_INITA, (ioaddr + ALTR_A10_ECC_CTRL_OFST));
while (limit--) {
if (ecc_test_bits(ALTR_A10_ECC_INITCOMPLETEA,
(ioaddr + ALTR_A10_ECC_INITSTAT_OFST)))
break;
udelay(1);
}
if (limit < 0)
return -EBUSY;
/* Clear any pending ECC interrupts */
writel(ALTR_A10_ECC_ERRPENA_MASK,
(ioaddr + ALTR_A10_ECC_INTSTAT_OFST));
return 0;
}
void socfpga_init_arria10_ocram_ecc(void)
{
struct device_node *np;
int ret = 0;
void __iomem *ecc_block_base;
if (!sys_manager_base_addr) {
pr_err("SOCFPGA: sys-mgr is not initialized\n");
return;
}
/* Find the OCRAM EDAC device tree node */
np = of_find_compatible_node(NULL, NULL, "altr,socfpga-a10-ocram-ecc");
if (!np) {
pr_err("Unable to find socfpga-a10-ocram-ecc\n");
return;
}
/* Map the ECC Block */
ecc_block_base = of_iomap(np, 0);
of_node_put(np);
if (!ecc_block_base) {
pr_err("Unable to map OCRAM ECC block\n");
return;
}
/* Disable ECC */
writel(ALTR_A10_OCRAM_ECC_EN_CTL,
sys_manager_base_addr + A10_SYSMGR_ECC_INTMASK_SET_OFST);
ecc_clear_bits(ALTR_A10_ECC_SERRINTEN,
(ecc_block_base + ALTR_A10_ECC_ERRINTEN_OFST));
ecc_clear_bits(ALTR_A10_OCRAM_ECC_EN_CTL,
(ecc_block_base + ALTR_A10_ECC_CTRL_OFST));
/* Ensure all writes complete */
wmb();
/* Use HW initialization block to initialize memory for ECC */
ret = altr_init_memory_port(ecc_block_base);
if (ret) {
pr_err("ECC: cannot init OCRAM PORTA memory\n");
goto exit;
}
/* Enable ECC */
ecc_set_bits(ALTR_A10_OCRAM_ECC_EN_CTL,
(ecc_block_base + ALTR_A10_ECC_CTRL_OFST));
ecc_set_bits(ALTR_A10_ECC_SERRINTEN,
(ecc_block_base + ALTR_A10_ECC_ERRINTEN_OFST));
writel(ALTR_A10_OCRAM_ECC_EN_CTL,
sys_manager_base_addr + A10_SYSMGR_ECC_INTMASK_CLR_OFST);
/* Ensure all writes complete */
wmb();
exit:
iounmap(ecc_block_base);
}
......@@ -66,6 +66,16 @@ static void __init socfpga_init_irq(void)
socfpga_init_ocram_ecc();
}
static void __init socfpga_arria10_init_irq(void)
{
irqchip_init();
socfpga_sysmgr_init();
if (IS_ENABLED(CONFIG_EDAC_ALTERA_L2C))
socfpga_init_arria10_l2_ecc();
if (IS_ENABLED(CONFIG_EDAC_ALTERA_OCRAM))
socfpga_init_arria10_ocram_ecc();
}
static void socfpga_cyclone5_restart(enum reboot_mode mode, const char *cmd)
{
u32 temp;
......@@ -113,7 +123,7 @@ static const char *altera_a10_dt_match[] = {
DT_MACHINE_START(SOCFPGA_A10, "Altera SOCFPGA Arria10")
.l2c_aux_val = 0,
.l2c_aux_mask = ~0,
.init_irq = socfpga_init_irq,
.init_irq = socfpga_arria10_init_irq,
.restart = socfpga_arria10_restart,
.dt_compat = altera_a10_dt_match,
MACHINE_END
......@@ -378,12 +378,11 @@ config EDAC_ALTERA
config EDAC_ALTERA_L2C
bool "Altera L2 Cache ECC"
depends on EDAC_ALTERA=y
select CACHE_L2X0
depends on EDAC_ALTERA=y && CACHE_L2X0
help
Support for error detection and correction on the
Altera L2 cache Memory for Altera SoCs. This option
requires L2 cache so it will force that selection.
requires L2 cache.
config EDAC_ALTERA_OCRAM
bool "Altera On-Chip RAM ECC"
......
This diff is collapsed.
......@@ -195,4 +195,132 @@ struct altr_sdram_mc_data {
const struct altr_sdram_prv_data *data;
};
/************************** EDAC Device Defines **************************/
/***** General Device Trigger Defines *****/
#define ALTR_UE_TRIGGER_CHAR 'U' /* Trigger for UE */
#define ALTR_TRIGGER_READ_WRD_CNT 32 /* Line size x 4 */
#define ALTR_TRIG_OCRAM_BYTE_SIZE 128 /* Line size x 4 */
#define ALTR_TRIG_L2C_BYTE_SIZE 4096 /* Full Page */
/******* Cyclone5 and Arria5 Defines *******/
/* OCRAM ECC Management Group Defines */
#define ALTR_MAN_GRP_OCRAM_ECC_OFFSET 0x04
#define ALTR_OCR_ECC_REG_OFFSET 0x00
#define ALTR_OCR_ECC_EN BIT(0)
#define ALTR_OCR_ECC_INJS BIT(1)
#define ALTR_OCR_ECC_INJD BIT(2)
#define ALTR_OCR_ECC_SERR BIT(3)
#define ALTR_OCR_ECC_DERR BIT(4)
/* L2 ECC Management Group Defines */
#define ALTR_MAN_GRP_L2_ECC_OFFSET 0x00
#define ALTR_L2_ECC_REG_OFFSET 0x00
#define ALTR_L2_ECC_EN BIT(0)
#define ALTR_L2_ECC_INJS BIT(1)
#define ALTR_L2_ECC_INJD BIT(2)
/* Arria10 General ECC Block Module Defines */
#define ALTR_A10_ECC_CTRL_OFST 0x08
#define ALTR_A10_ECC_EN BIT(0)
#define ALTR_A10_ECC_INITA BIT(16)
#define ALTR_A10_ECC_INITB BIT(24)
#define ALTR_A10_ECC_INITSTAT_OFST 0x0C
#define ALTR_A10_ECC_INITCOMPLETEA BIT(0)
#define ALTR_A10_ECC_INITCOMPLETEB BIT(8)
#define ALTR_A10_ECC_ERRINTEN_OFST 0x10
#define ALTR_A10_ECC_SERRINTEN BIT(0)
#define ALTR_A10_ECC_INTSTAT_OFST 0x20
#define ALTR_A10_ECC_SERRPENA BIT(0)
#define ALTR_A10_ECC_DERRPENA BIT(8)
#define ALTR_A10_ECC_ERRPENA_MASK (ALTR_A10_ECC_SERRPENA | \
ALTR_A10_ECC_DERRPENA)
#define ALTR_A10_ECC_SERRPENB BIT(16)
#define ALTR_A10_ECC_DERRPENB BIT(24)
#define ALTR_A10_ECC_ERRPENB_MASK (ALTR_A10_ECC_SERRPENB | \
ALTR_A10_ECC_DERRPENB)
#define ALTR_A10_ECC_INTTEST_OFST 0x24
#define ALTR_A10_ECC_TSERRA BIT(0)
#define ALTR_A10_ECC_TDERRA BIT(8)
/* ECC Manager Defines */
#define A10_SYSMGR_ECC_INTMASK_SET_OFST 0x94
#define A10_SYSMGR_ECC_INTMASK_CLR_OFST 0x98
#define A10_SYSMGR_ECC_INTMASK_OCRAM BIT(1)
#define A10_SYSMGR_ECC_INTSTAT_SERR_OFST 0x9C
#define A10_SYSMGR_ECC_INTSTAT_DERR_OFST 0xA0
#define A10_SYSMGR_ECC_INTSTAT_L2 BIT(0)
#define A10_SYSMGR_ECC_INTSTAT_OCRAM BIT(1)
#define A10_SYSGMR_MPU_CLEAR_L2_ECC_OFST 0xA8
#define A10_SYSGMR_MPU_CLEAR_L2_ECC_SB BIT(15)
#define A10_SYSGMR_MPU_CLEAR_L2_ECC_MB BIT(31)
/* Arria 10 L2 ECC Management Group Defines */
#define ALTR_A10_L2_ECC_CTL_OFST 0x0
#define ALTR_A10_L2_ECC_EN_CTL BIT(0)
#define ALTR_A10_L2_ECC_STATUS 0xFFD060A4
#define ALTR_A10_L2_ECC_STAT_OFST 0xA4
#define ALTR_A10_L2_ECC_SERR_PEND BIT(0)
#define ALTR_A10_L2_ECC_MERR_PEND BIT(0)
#define ALTR_A10_L2_ECC_CLR_OFST 0x4
#define ALTR_A10_L2_ECC_SERR_CLR BIT(15)
#define ALTR_A10_L2_ECC_MERR_CLR BIT(31)
#define ALTR_A10_L2_ECC_INJ_OFST ALTR_A10_L2_ECC_CTL_OFST
#define ALTR_A10_L2_ECC_CE_INJ_MASK 0x00000101
#define ALTR_A10_L2_ECC_UE_INJ_MASK 0x00010101
/* Arria 10 OCRAM ECC Management Group Defines */
#define ALTR_A10_OCRAM_ECC_EN_CTL (BIT(1) | BIT(0))
struct altr_edac_device_dev;
struct edac_device_prv_data {
int (*setup)(struct altr_edac_device_dev *device);
int ce_clear_mask;
int ue_clear_mask;
int irq_status_mask;
char dbgfs_name[20];
void * (*alloc_mem)(size_t size, void **other);
void (*free_mem)(void *p, size_t size, void *other);
int ecc_enable_mask;
int ecc_en_ofst;
int ce_set_mask;
int ue_set_mask;
int set_err_ofst;
irqreturn_t (*ecc_irq_handler)(struct altr_edac_device_dev *dci,
bool sb);
int trig_alloc_sz;
const struct file_operations *inject_fops;
};
struct altr_edac_device_dev {
struct list_head next;
void __iomem *base;
int sb_irq;
int db_irq;
const struct edac_device_prv_data *data;
struct dentry *debugfs_dir;
char *edac_dev_name;
struct altr_arria10_edac *edac;
struct edac_device_ctl_info *edac_dev;
struct device ddev;
int edac_idx;
};
struct altr_arria10_edac {
struct device *dev;
struct regmap *ecc_mgr_map;
int sb_irq;
int db_irq;
struct list_head a10_ecc_devices;
};
#endif /* #ifndef _ALTERA_EDAC_H */
......@@ -15,11 +15,6 @@ module_param(ecc_enable_override, int, 0644);
static struct msr __percpu *msrs;
/*
* count successfully initialized driver instances for setup_pci_device()
*/
static atomic_t drv_instances = ATOMIC_INIT(0);
/* Per-node stuff */
static struct ecc_settings **ecc_stngs;
......@@ -1918,7 +1913,7 @@ static struct amd64_family_type family_types[] = {
[K8_CPUS] = {
.ctl_name = "K8",
.f1_id = PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
.f3_id = PCI_DEVICE_ID_AMD_K8_NB_MISC,
.f2_id = PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
.ops = {
.early_channel_count = k8_early_channel_count,
.map_sysaddr_to_csrow = k8_map_sysaddr_to_csrow,
......@@ -1928,7 +1923,7 @@ static struct amd64_family_type family_types[] = {
[F10_CPUS] = {
.ctl_name = "F10h",
.f1_id = PCI_DEVICE_ID_AMD_10H_NB_MAP,
.f3_id = PCI_DEVICE_ID_AMD_10H_NB_MISC,
.f2_id = PCI_DEVICE_ID_AMD_10H_NB_DRAM,
.ops = {
.early_channel_count = f1x_early_channel_count,
.map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow,
......@@ -1938,7 +1933,7 @@ static struct amd64_family_type family_types[] = {
[F15_CPUS] = {
.ctl_name = "F15h",
.f1_id = PCI_DEVICE_ID_AMD_15H_NB_F1,
.f3_id = PCI_DEVICE_ID_AMD_15H_NB_F3,
.f2_id = PCI_DEVICE_ID_AMD_15H_NB_F2,
.ops = {
.early_channel_count = f1x_early_channel_count,
.map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow,
......@@ -1948,7 +1943,7 @@ static struct amd64_family_type family_types[] = {
[F15_M30H_CPUS] = {
.ctl_name = "F15h_M30h",
.f1_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F1,
.f3_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F3,
.f2_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F2,
.ops = {
.early_channel_count = f1x_early_channel_count,
.map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow,
......@@ -1958,7 +1953,7 @@ static struct amd64_family_type family_types[] = {
[F15_M60H_CPUS] = {
.ctl_name = "F15h_M60h",
.f1_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F1,
.f3_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F3,
.f2_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F2,
.ops = {
.early_channel_count = f1x_early_channel_count,
.map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow,
......@@ -1968,7 +1963,7 @@ static struct amd64_family_type family_types[] = {
[F16_CPUS] = {
.ctl_name = "F16h",
.f1_id = PCI_DEVICE_ID_AMD_16H_NB_F1,
.f3_id = PCI_DEVICE_ID_AMD_16H_NB_F3,
.f2_id = PCI_DEVICE_ID_AMD_16H_NB_F2,
.ops = {
.early_channel_count = f1x_early_channel_count,
.map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow,
......@@ -1978,7 +1973,7 @@ static struct amd64_family_type family_types[] = {
[F16_M30H_CPUS] = {
.ctl_name = "F16h_M30h",
.f1_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F1,
.f3_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F3,
.f2_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F2,
.ops = {
.early_channel_count = f1x_early_channel_count,
.map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow,
......@@ -2227,13 +2222,13 @@ static inline void decode_bus_error(int node_id, struct mce *m)
}
/*
* Use pvt->F2 which contains the F2 CPU PCI device to get the related
* F1 (AddrMap) and F3 (Misc) devices. Return negative value on error.
* Use pvt->F3 which contains the F3 CPU PCI device to get the related
* F1 (AddrMap) and F2 (Dct) devices. Return negative value on error.
*/
static int reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 f1_id, u16 f3_id)
static int reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 f1_id, u16 f2_id)
{
/* Reserve the ADDRESS MAP Device */
pvt->F1 = pci_get_related_function(pvt->F2->vendor, f1_id, pvt->F2);
pvt->F1 = pci_get_related_function(pvt->F3->vendor, f1_id, pvt->F3);
if (!pvt->F1) {
amd64_err("error address map device not found: "
"vendor %x device 0x%x (broken BIOS?)\n",
......@@ -2241,15 +2236,15 @@ static int reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 f1_id, u16 f3_id)
return -ENODEV;
}
/* Reserve the MISC Device */
pvt->F3 = pci_get_related_function(pvt->F2->vendor, f3_id, pvt->F2);
if (!pvt->F3) {
/* Reserve the DCT Device */
pvt->F2 = pci_get_related_function(pvt->F3->vendor, f2_id, pvt->F3);
if (!pvt->F2) {
pci_dev_put(pvt->F1);
pvt->F1 = NULL;
amd64_err("error F3 device not found: "
amd64_err("error F2 device not found: "
"vendor %x device 0x%x (broken BIOS?)\n",
PCI_VENDOR_ID_AMD, f3_id);
PCI_VENDOR_ID_AMD, f2_id);
return -ENODEV;
}
......@@ -2263,7 +2258,7 @@ static int reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 f1_id, u16 f3_id)
static void free_mc_sibling_devs(struct amd64_pvt *pvt)
{
pci_dev_put(pvt->F1);
pci_dev_put(pvt->F3);
pci_dev_put(pvt->F2);
}
/*
......@@ -2778,14 +2773,14 @@ static const struct attribute_group *amd64_edac_attr_groups[] = {
NULL
};
static int init_one_instance(struct pci_dev *F2)
static int init_one_instance(unsigned int nid)
{
struct amd64_pvt *pvt = NULL;
struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
struct amd64_family_type *fam_type = NULL;
struct mem_ctl_info *mci = NULL;
struct edac_mc_layer layers[2];
struct amd64_pvt *pvt = NULL;
int err = 0, ret;
u16 nid = amd_pci_dev_to_node_id(F2);
ret = -ENOMEM;
pvt = kzalloc(sizeof(struct amd64_pvt), GFP_KERNEL);
......@@ -2793,7 +2788,7 @@ static int init_one_instance(struct pci_dev *F2)
goto err_ret;
pvt->mc_node_id = nid;
pvt->F2 = F2;
pvt->F3 = F3;
ret = -EINVAL;
fam_type = per_family_init(pvt);
......@@ -2801,7 +2796,7 @@ static int init_one_instance(struct pci_dev *F2)
goto err_free;
ret = -ENODEV;
err = reserve_mc_sibling_devs(pvt, fam_type->f1_id, fam_type->f3_id);
err = reserve_mc_sibling_devs(pvt, fam_type->f1_id, fam_type->f2_id);
if (err)
goto err_free;
......@@ -2836,7 +2831,7 @@ static int init_one_instance(struct pci_dev *F2)
goto err_siblings;
mci->pvt_info = pvt;
mci->pdev = &pvt->F2->dev;
mci->pdev = &pvt->F3->dev;
setup_mci_misc_attrs(mci, fam_type);
......@@ -2855,8 +2850,6 @@ static int init_one_instance(struct pci_dev *F2)
amd_register_ecc_decoder(decode_bus_error);
atomic_inc(&drv_instances);
return 0;
err_add_mc:
......@@ -2872,19 +2865,11 @@ static int init_one_instance(struct pci_dev *F2)
return ret;
}
static int probe_one_instance(struct pci_dev *pdev,
const struct pci_device_id *mc_type)
static int probe_one_instance(unsigned int nid)
{
u16 nid = amd_pci_dev_to_node_id(pdev);
struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
struct ecc_settings *s;
int ret = 0;
ret = pci_enable_device(pdev);
if (ret < 0) {
edac_dbg(0, "ret=%d\n", ret);
return -EIO;
}
int ret;
ret = -ENOMEM;
s = kzalloc(sizeof(struct ecc_settings), GFP_KERNEL);
......@@ -2905,7 +2890,7 @@ static int probe_one_instance(struct pci_dev *pdev,
goto err_enable;
}
ret = init_one_instance(pdev);
ret = init_one_instance(nid);
if (ret < 0) {
amd64_err("Error probing instance: %d\n", nid);
restore_ecc_error_reporting(s, nid, F3);
......@@ -2921,19 +2906,18 @@ static int probe_one_instance(struct pci_dev *pdev,
return ret;
}
static void remove_one_instance(struct pci_dev *pdev)
static void remove_one_instance(unsigned int nid)
{
struct mem_ctl_info *mci;
struct amd64_pvt *pvt;
u16 nid = amd_pci_dev_to_node_id(pdev);
struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
struct ecc_settings *s = ecc_stngs[nid];
struct mem_ctl_info *mci;
struct amd64_pvt *pvt;
mci = find_mci_by_dev(&pdev->dev);
mci = find_mci_by_dev(&F3->dev);
WARN_ON(!mci);
/* Remove from EDAC CORE tracking list */
mci = edac_mc_del_mc(&pdev->dev);
mci = edac_mc_del_mc(&F3->dev);
if (!mci)
return;
......@@ -2957,31 +2941,6 @@ static void remove_one_instance(struct pci_dev *pdev)
edac_mc_free(mci);
}
/*
* This table is part of the interface for loading drivers for PCI devices. The
* PCI core identifies what devices are on a system during boot, and then
* inquiry this table to see if this driver is for a given device found.
*/
static const struct pci_device_id amd64_pci_table[] = {
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_NB_F2) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F2) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F2) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_NB_F2) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F2) },
{0, }
};
MODULE_DEVICE_TABLE(pci, amd64_pci_table);
static struct pci_driver amd64_pci_driver = {
.name = EDAC_MOD_STR,
.probe = probe_one_instance,
.remove = remove_one_instance,
.id_table = amd64_pci_table,
.driver.probe_type = PROBE_FORCE_SYNCHRONOUS,
};
static void setup_pci_device(void)
{
struct mem_ctl_info *mci;
......@@ -3005,8 +2964,7 @@ static void setup_pci_device(void)
static int __init amd64_edac_init(void)
{
int err = -ENODEV;
printk(KERN_INFO "AMD64 EDAC driver v%s\n", EDAC_AMD64_VERSION);
int i;
opstate_init();
......@@ -3022,13 +2980,14 @@ static int __init amd64_edac_init(void)
if (!msrs)
goto err_free;
err = pci_register_driver(&amd64_pci_driver);
if (err)
goto err_pci;
for (i = 0; i < amd_nb_num(); i++)
if (probe_one_instance(i)) {
/* unwind properly */
while (--i >= 0)
remove_one_instance(i);
err = -ENODEV;
if (!atomic_read(&drv_instances))
goto err_no_instances;
goto err_pci;
}
setup_pci_device();
......@@ -3036,10 +2995,9 @@ static int __init amd64_edac_init(void)
amd64_err("%s on 32-bit is unsupported. USE AT YOUR OWN RISK!\n", EDAC_MOD_STR);
#endif
return 0;
printk(KERN_INFO "AMD64 EDAC driver v%s\n", EDAC_AMD64_VERSION);
err_no_instances:
pci_unregister_driver(&amd64_pci_driver);
return 0;
err_pci:
msrs_free(msrs);
......@@ -3055,10 +3013,13 @@ static int __init amd64_edac_init(void)
static void __exit amd64_edac_exit(void)
{
int i;
if (pci_ctl)
edac_pci_release_generic_ctl(pci_ctl);
pci_unregister_driver(&amd64_pci_driver);
for (i = 0; i < amd_nb_num(); i++)
remove_one_instance(i);
kfree(ecc_stngs);
ecc_stngs = NULL;
......
......@@ -422,7 +422,7 @@ struct low_ops {
struct amd64_family_type {
const char *ctl_name;
u16 f1_id, f3_id;
u16 f1_id, f2_id;
struct low_ops ops;
};
......
......@@ -923,7 +923,7 @@ static void edac_inc_ue_error(struct mem_ctl_info *mci,
mci->ue_mc += count;
if (!enable_per_layer_report) {
mci->ce_noinfo_count += count;
mci->ue_noinfo_count += count;
return;
}
......
......@@ -998,11 +998,12 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci)
void edac_unregister_sysfs(struct mem_ctl_info *mci)
{
struct bus_type *bus = mci->bus;
const char *name = mci->bus->name;
edac_dbg(1, "Unregistering device %s\n", dev_name(&mci->dev));
device_unregister(&mci->dev);
bus_unregister(mci->bus);
bus_unregister(bus);
kfree(name);
}
......
......@@ -271,16 +271,6 @@ struct i7core_pvt {
bool is_registered, enable_scrub;
/* Fifo double buffers */
struct mce mce_entry[MCE_LOG_LEN];
struct mce mce_outentry[MCE_LOG_LEN];
/* Fifo in/out counters */
unsigned mce_in, mce_out;
/* Count indicator to show errors not got */
unsigned mce_overrun;
/* DCLK Frequency used for computing scrub rate */
int dclk_freq;
......@@ -1792,56 +1782,15 @@ static void i7core_mce_output_error(struct mem_ctl_info *mci,
* i7core_check_error Retrieve and process errors reported by the
* hardware. Called by the Core module.
*/
static void i7core_check_error(struct mem_ctl_info *mci)
static void i7core_check_error(struct mem_ctl_info *mci, struct mce *m)
{
struct i7core_pvt *pvt = mci->pvt_info;
int i;
unsigned count = 0;
struct mce *m;
/*
* MCE first step: Copy all mce errors into a temporary buffer
* We use a double buffering here, to reduce the risk of
* losing an error.
*/
smp_rmb();
count = (pvt->mce_out + MCE_LOG_LEN - pvt->mce_in)
% MCE_LOG_LEN;
if (!count)
goto check_ce_error;
m = pvt->mce_outentry;
if (pvt->mce_in + count > MCE_LOG_LEN) {
unsigned l = MCE_LOG_LEN - pvt->mce_in;
memcpy(m, &pvt->mce_entry[pvt->mce_in], sizeof(*m) * l);
smp_wmb();
pvt->mce_in = 0;
count -= l;
m += l;
}
memcpy(m, &pvt->mce_entry[pvt->mce_in], sizeof(*m) * count);
smp_wmb();
pvt->mce_in += count;
smp_rmb();
if (pvt->mce_overrun) {
i7core_printk(KERN_ERR, "Lost %d memory errors\n",
pvt->mce_overrun);
smp_wmb();
pvt->mce_overrun = 0;
}
/*
* MCE second step: parse errors and display
*/
for (i = 0; i < count; i++)
i7core_mce_output_error(mci, &pvt->mce_outentry[i]);
i7core_mce_output_error(mci, m);
/*
* Now, let's increment CE error counts
*/
check_ce_error:
if (!pvt->is_registered)
i7core_udimm_check_mc_ecc_err(mci);
else
......@@ -1849,12 +1798,8 @@ static void i7core_check_error(struct mem_ctl_info *mci)
}
/*
* i7core_mce_check_error Replicates mcelog routine to get errors
* This routine simply queues mcelog errors, and
* return. The error itself should be handled later
* by i7core_check_error.
* WARNING: As this routine should be called at NMI time, extra care should
* be taken to avoid deadlocks, and to be as fast as possible.
* Check that logging is enabled and that this is the right type
* of error for us to handle.
*/
static int i7core_mce_check_error(struct notifier_block *nb, unsigned long val,
void *data)
......@@ -1882,21 +1827,7 @@ static int i7core_mce_check_error(struct notifier_block *nb, unsigned long val,
if (mce->bank != 8)
return NOTIFY_DONE;
smp_rmb();
if ((pvt->mce_out + 1) % MCE_LOG_LEN == pvt->mce_in) {
smp_wmb();
pvt->mce_overrun++;
return NOTIFY_DONE;
}
/* Copy memory error at the ringbuffer */
memcpy(&pvt->mce_entry[pvt->mce_out], mce, sizeof(*mce));
smp_wmb();
pvt->mce_out = (pvt->mce_out + 1) % MCE_LOG_LEN;
/* Handle fatal errors immediately */
if (mce->mcgstatus & 1)
i7core_check_error(mci);
i7core_check_error(mci, mce);
/* Advise mcelog that the errors were handled */
return NOTIFY_STOP;
......@@ -2243,8 +2174,6 @@ static int i7core_register_mci(struct i7core_dev *i7core_dev)
get_dimm_config(mci);
/* record ptr to the generic device */
mci->pdev = &i7core_dev->pdev[0]->dev;
/* Set the function pointer to an actual operation function */
mci->edac_check = i7core_check_error;
/* Enable scrubrate setting */
if (pvt->enable_scrub)
......
......@@ -17,6 +17,7 @@
* 015c: Xeon E3-1200 v2/3rd Gen Core processor DRAM Controller
* 0c04: Xeon E3-1200 v3/4th Gen Core Processor DRAM Controller
* 0c08: Xeon E3-1200 v3 Processor DRAM Controller
* 1918: Xeon E3-1200 v5 Skylake Host Bridge/DRAM Registers
*
* Based on Intel specification:
* http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/xeon-e3-1200v3-vol-2-datasheet.pdf
......@@ -55,6 +56,7 @@
#define PCI_DEVICE_ID_INTEL_IE31200_HB_5 0x015c
#define PCI_DEVICE_ID_INTEL_IE31200_HB_6 0x0c04
#define PCI_DEVICE_ID_INTEL_IE31200_HB_7 0x0c08
#define PCI_DEVICE_ID_INTEL_IE31200_HB_8 0x1918
#define IE31200_DIMMS 4
#define IE31200_RANKS 8
......@@ -105,8 +107,11 @@
* 1 Multiple Bit Error Status (MERRSTS)
* 0 Correctable Error Status (CERRSTS)
*/
#define IE31200_C0ECCERRLOG 0x40c8
#define IE31200_C1ECCERRLOG 0x44c8
#define IE31200_C0ECCERRLOG_SKL 0x4048
#define IE31200_C1ECCERRLOG_SKL 0x4448
#define IE31200_ECCERRLOG_CE BIT(0)
#define IE31200_ECCERRLOG_UE BIT(1)
#define IE31200_ECCERRLOG_RANK_BITS GENMASK_ULL(28, 27)
......@@ -123,17 +128,28 @@
#define IE31200_CAPID0_DDPCD BIT(6)
#define IE31200_CAPID0_ECC BIT(1)
#define IE31200_MAD_DIMM_0_OFFSET 0x5004
#define IE31200_MAD_DIMM_SIZE GENMASK_ULL(7, 0)
#define IE31200_MAD_DIMM_A_RANK BIT(17)
#define IE31200_MAD_DIMM_A_WIDTH BIT(19)
#define IE31200_PAGES(n) (n << (28 - PAGE_SHIFT))
#define IE31200_MAD_DIMM_0_OFFSET 0x5004
#define IE31200_MAD_DIMM_0_OFFSET_SKL 0x500C
#define IE31200_MAD_DIMM_SIZE GENMASK_ULL(7, 0)
#define IE31200_MAD_DIMM_A_RANK BIT(17)
#define IE31200_MAD_DIMM_A_RANK_SHIFT 17
#define IE31200_MAD_DIMM_A_RANK_SKL BIT(10)
#define IE31200_MAD_DIMM_A_RANK_SKL_SHIFT 10
#define IE31200_MAD_DIMM_A_WIDTH BIT(19)
#define IE31200_MAD_DIMM_A_WIDTH_SHIFT 19
#define IE31200_MAD_DIMM_A_WIDTH_SKL GENMASK_ULL(9, 8)
#define IE31200_MAD_DIMM_A_WIDTH_SKL_SHIFT 8
/* Skylake reports 1GB increments, everything else is 256MB */
#define IE31200_PAGES(n, skl) \
(n << (28 + (2 * skl) - PAGE_SHIFT))
static int nr_channels;
struct ie31200_priv {
void __iomem *window;
void __iomem *c0errlog;
void __iomem *c1errlog;
};
enum ie31200_chips {
......@@ -157,9 +173,9 @@ static const struct ie31200_dev_info ie31200_devs[] = {
};
struct dimm_data {
u8 size; /* in 256MB multiples */
u8 size; /* in multiples of 256MB, except Skylake is 1GB */
u8 dual_rank : 1,
x16_width : 1; /* 0 means x8 width */
x16_width : 2; /* 0 means x8 width */
};
static int how_many_channels(struct pci_dev *pdev)
......@@ -197,11 +213,10 @@ static bool ecc_capable(struct pci_dev *pdev)
return true;
}
static int eccerrlog_row(int channel, u64 log)
static int eccerrlog_row(u64 log)
{
int rank = ((log & IE31200_ECCERRLOG_RANK_BITS) >>
IE31200_ECCERRLOG_RANK_SHIFT);
return rank | (channel * IE31200_RANKS_PER_CHANNEL);
return ((log & IE31200_ECCERRLOG_RANK_BITS) >>
IE31200_ECCERRLOG_RANK_SHIFT);
}
static void ie31200_clear_error_info(struct mem_ctl_info *mci)
......@@ -219,7 +234,6 @@ static void ie31200_get_and_clear_error_info(struct mem_ctl_info *mci,
{
struct pci_dev *pdev;
struct ie31200_priv *priv = mci->pvt_info;
void __iomem *window = priv->window;
pdev = to_pci_dev(mci->pdev);
......@@ -232,9 +246,9 @@ static void ie31200_get_and_clear_error_info(struct mem_ctl_info *mci,
if (!(info->errsts & IE31200_ERRSTS_BITS))
return;
info->eccerrlog[0] = lo_hi_readq(window + IE31200_C0ECCERRLOG);
info->eccerrlog[0] = lo_hi_readq(priv->c0errlog);
if (nr_channels == 2)
info->eccerrlog[1] = lo_hi_readq(window + IE31200_C1ECCERRLOG);
info->eccerrlog[1] = lo_hi_readq(priv->c1errlog);
pci_read_config_word(pdev, IE31200_ERRSTS, &info->errsts2);
......@@ -245,10 +259,10 @@ static void ie31200_get_and_clear_error_info(struct mem_ctl_info *mci,
* should be UE info.
*/
if ((info->errsts ^ info->errsts2) & IE31200_ERRSTS_BITS) {
info->eccerrlog[0] = lo_hi_readq(window + IE31200_C0ECCERRLOG);
info->eccerrlog[0] = lo_hi_readq(priv->c0errlog);
if (nr_channels == 2)
info->eccerrlog[1] =
lo_hi_readq(window + IE31200_C1ECCERRLOG);
lo_hi_readq(priv->c1errlog);
}
ie31200_clear_error_info(mci);
......@@ -274,14 +288,14 @@ static void ie31200_process_error_info(struct mem_ctl_info *mci,
if (log & IE31200_ECCERRLOG_UE) {
edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
0, 0, 0,
eccerrlog_row(channel, log),
eccerrlog_row(log),
channel, -1,
"ie31200 UE", "");
} else if (log & IE31200_ECCERRLOG_CE) {
edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
0, 0,
IE31200_ECCERRLOG_SYNDROME(log),
eccerrlog_row(channel, log),
eccerrlog_row(log),
channel, -1,
"ie31200 CE", "");
}
......@@ -326,6 +340,33 @@ static void __iomem *ie31200_map_mchbar(struct pci_dev *pdev)
return window;
}
static void __skl_populate_dimm_info(struct dimm_data *dd, u32 addr_decode,
int chan)
{
dd->size = (addr_decode >> (chan << 4)) & IE31200_MAD_DIMM_SIZE;
dd->dual_rank = (addr_decode & (IE31200_MAD_DIMM_A_RANK_SKL << (chan << 4))) ? 1 : 0;
dd->x16_width = ((addr_decode & (IE31200_MAD_DIMM_A_WIDTH_SKL << (chan << 4))) >>
(IE31200_MAD_DIMM_A_WIDTH_SKL_SHIFT + (chan << 4)));
}
static void __populate_dimm_info(struct dimm_data *dd, u32 addr_decode,
int chan)
{
dd->size = (addr_decode >> (chan << 3)) & IE31200_MAD_DIMM_SIZE;
dd->dual_rank = (addr_decode & (IE31200_MAD_DIMM_A_RANK << chan)) ? 1 : 0;
dd->x16_width = (addr_decode & (IE31200_MAD_DIMM_A_WIDTH << chan)) ? 1 : 0;
}
static void populate_dimm_info(struct dimm_data *dd, u32 addr_decode, int chan,
bool skl)
{
if (skl)
__skl_populate_dimm_info(dd, addr_decode, chan);
else
__populate_dimm_info(dd, addr_decode, chan);
}
static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
{
int i, j, ret;
......@@ -334,7 +375,8 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
struct dimm_data dimm_info[IE31200_CHANNELS][IE31200_DIMMS_PER_CHANNEL];
void __iomem *window;
struct ie31200_priv *priv;
u32 addr_decode;
u32 addr_decode, mad_offset;
bool skl = (pdev->device == PCI_DEVICE_ID_INTEL_IE31200_HB_8);
edac_dbg(0, "MC:\n");
......@@ -363,7 +405,10 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
edac_dbg(3, "MC: init mci\n");
mci->pdev = &pdev->dev;
mci->mtype_cap = MEM_FLAG_DDR3;
if (skl)
mci->mtype_cap = MEM_FLAG_DDR4;
else
mci->mtype_cap = MEM_FLAG_DDR3;
mci->edac_ctl_cap = EDAC_FLAG_SECDED;
mci->edac_cap = EDAC_FLAG_SECDED;
mci->mod_name = EDAC_MOD_STR;
......@@ -374,19 +419,24 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
mci->ctl_page_to_phys = NULL;
priv = mci->pvt_info;
priv->window = window;
if (skl) {
priv->c0errlog = window + IE31200_C0ECCERRLOG_SKL;
priv->c1errlog = window + IE31200_C1ECCERRLOG_SKL;
mad_offset = IE31200_MAD_DIMM_0_OFFSET_SKL;
} else {
priv->c0errlog = window + IE31200_C0ECCERRLOG;
priv->c1errlog = window + IE31200_C1ECCERRLOG;
mad_offset = IE31200_MAD_DIMM_0_OFFSET;
}
/* populate DIMM info */
for (i = 0; i < IE31200_CHANNELS; i++) {
addr_decode = readl(window + IE31200_MAD_DIMM_0_OFFSET +
addr_decode = readl(window + mad_offset +
(i * 4));
edac_dbg(0, "addr_decode: 0x%x\n", addr_decode);
for (j = 0; j < IE31200_DIMMS_PER_CHANNEL; j++) {
dimm_info[i][j].size = (addr_decode >> (j * 8)) &
IE31200_MAD_DIMM_SIZE;
dimm_info[i][j].dual_rank = (addr_decode &
(IE31200_MAD_DIMM_A_RANK << j)) ? 1 : 0;
dimm_info[i][j].x16_width = (addr_decode &
(IE31200_MAD_DIMM_A_WIDTH << j)) ? 1 : 0;
populate_dimm_info(&dimm_info[i][j], addr_decode, j,
skl);
edac_dbg(0, "size: 0x%x, rank: %d, width: %d\n",
dimm_info[i][j].size,
dimm_info[i][j].dual_rank,
......@@ -405,7 +455,7 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
struct dimm_info *dimm;
unsigned long nr_pages;
nr_pages = IE31200_PAGES(dimm_info[j][i].size);
nr_pages = IE31200_PAGES(dimm_info[j][i].size, skl);
if (nr_pages == 0)
continue;
......@@ -417,7 +467,10 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
dimm->nr_pages = nr_pages;
edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages);
dimm->grain = 8; /* just a guess */
dimm->mtype = MEM_DDR3;
if (skl)
dimm->mtype = MEM_DDR4;
else
dimm->mtype = MEM_DDR3;
dimm->dtype = DEV_UNKNOWN;
dimm->edac_mode = EDAC_UNKNOWN;
}
......@@ -426,7 +479,10 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
dimm->nr_pages = nr_pages;
edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages);
dimm->grain = 8; /* same guess */
dimm->mtype = MEM_DDR3;
if (skl)
dimm->mtype = MEM_DDR4;
else
dimm->mtype = MEM_DDR3;
dimm->dtype = DEV_UNKNOWN;
dimm->edac_mode = EDAC_UNKNOWN;
}
......@@ -500,6 +556,9 @@ static const struct pci_device_id ie31200_pci_tbl[] = {
{
PCI_VEND_DEV(INTEL, IE31200_HB_7), PCI_ANY_ID, PCI_ANY_ID, 0, 0,
IE31200},
{
PCI_VEND_DEV(INTEL, IE31200_HB_8), PCI_ANY_ID, PCI_ANY_ID, 0, 0,
IE31200},
{
0,
} /* 0 terminated list. */
......
......@@ -21,6 +21,8 @@
#include <linux/smp.h>
#include <linux/bitmap.h>
#include <linux/math64.h>
#include <linux/mod_devicetable.h>
#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/mce.h>
......@@ -28,8 +30,6 @@
/* Static vars */
static LIST_HEAD(sbridge_edac_list);
static DEFINE_MUTEX(sbridge_edac_lock);
static int probed;
/*
* Alter this version for the module when modifications are made
......@@ -364,16 +364,6 @@ struct sbridge_pvt {
bool is_mirrored, is_lockstep, is_close_pg;
bool is_chan_hash;
/* Fifo double buffers */
struct mce mce_entry[MCE_LOG_LEN];
struct mce mce_outentry[MCE_LOG_LEN];
/* Fifo in/out counters */
unsigned mce_in, mce_out;
/* Count indicator to show errors not got */
unsigned mce_overrun;
/* Memory description */
u64 tolm, tohm;
struct knl_pvt knl;
......@@ -662,18 +652,6 @@ static const struct pci_id_table pci_dev_descr_broadwell_table[] = {
{0,} /* 0 terminated list. */
};
/*
* pci_device_id table for which devices we are looking for
*/
static const struct pci_device_id sbridge_pci_tbl[] = {
{PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_HA0)},
{PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA)},
{PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0)},
{PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0)},
{PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KNL_IMC_SAD0)},
{0,} /* 0 terminated list. */
};
/****************************************************************************
Ancillary status routines
......@@ -3097,63 +3075,8 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
}
/*
* sbridge_check_error Retrieve and process errors reported by the
* hardware. Called by the Core module.
*/
static void sbridge_check_error(struct mem_ctl_info *mci)
{
struct sbridge_pvt *pvt = mci->pvt_info;
int i;
unsigned count = 0;
struct mce *m;
/*
* MCE first step: Copy all mce errors into a temporary buffer
* We use a double buffering here, to reduce the risk of
* loosing an error.
*/
smp_rmb();
count = (pvt->mce_out + MCE_LOG_LEN - pvt->mce_in)
% MCE_LOG_LEN;
if (!count)
return;
m = pvt->mce_outentry;
if (pvt->mce_in + count > MCE_LOG_LEN) {
unsigned l = MCE_LOG_LEN - pvt->mce_in;
memcpy(m, &pvt->mce_entry[pvt->mce_in], sizeof(*m) * l);
smp_wmb();
pvt->mce_in = 0;
count -= l;
m += l;
}
memcpy(m, &pvt->mce_entry[pvt->mce_in], sizeof(*m) * count);
smp_wmb();
pvt->mce_in += count;
smp_rmb();
if (pvt->mce_overrun) {
sbridge_printk(KERN_ERR, "Lost %d memory errors\n",
pvt->mce_overrun);
smp_wmb();
pvt->mce_overrun = 0;
}
/*
* MCE second step: parse errors and display
*/
for (i = 0; i < count; i++)
sbridge_mce_output_error(mci, &pvt->mce_outentry[i]);
}
/*
* sbridge_mce_check_error Replicates mcelog routine to get errors
* This routine simply queues mcelog errors, and
* return. The error itself should be handled later
* by sbridge_check_error.
* WARNING: As this routine should be called at NMI time, extra care should
* be taken to avoid deadlocks, and to be as fast as possible.
* Check that logging is enabled and that this is the right type
* of error for us to handle.
*/
static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val,
void *data)
......@@ -3198,21 +3121,7 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val,
"%u APIC %x\n", mce->cpuvendor, mce->cpuid,
mce->time, mce->socketid, mce->apicid);
smp_rmb();
if ((pvt->mce_out + 1) % MCE_LOG_LEN == pvt->mce_in) {
smp_wmb();
pvt->mce_overrun++;
return NOTIFY_DONE;
}
/* Copy memory error at the ringbuffer */
memcpy(&pvt->mce_entry[pvt->mce_out], mce, sizeof(*mce));
smp_wmb();
pvt->mce_out = (pvt->mce_out + 1) % MCE_LOG_LEN;
/* Handle fatal errors immediately */
if (mce->mcgstatus & 1)
sbridge_check_error(mci);
sbridge_mce_output_error(mci, mce);
/* Advice mcelog that the error were handled */
return NOTIFY_STOP;
......@@ -3298,9 +3207,6 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
mci->dev_name = pci_name(pdev);
mci->ctl_page_to_phys = NULL;
/* Set the function pointer to an actual operation function */
mci->edac_check = sbridge_check_error;
pvt->info.type = type;
switch (type) {
case IVY_BRIDGE:
......@@ -3448,62 +3354,40 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
return rc;
}
#define ICPU(model, table) \
{ X86_VENDOR_INTEL, 6, model, 0, (unsigned long)&table }
/* Order here must match "enum type" */
static const struct x86_cpu_id sbridge_cpuids[] = {
ICPU(0x2d, pci_dev_descr_sbridge_table), /* SANDY_BRIDGE */
ICPU(0x3e, pci_dev_descr_ibridge_table), /* IVY_BRIDGE */
ICPU(0x3f, pci_dev_descr_haswell_table), /* HASWELL */
ICPU(0x4f, pci_dev_descr_broadwell_table), /* BROADWELL */
ICPU(0x57, pci_dev_descr_knl_table), /* KNIGHTS_LANDING */
{ }
};
MODULE_DEVICE_TABLE(x86cpu, sbridge_cpuids);
/*
* sbridge_probe Probe for ONE instance of device to see if it is
* sbridge_probe Get all devices and register memory controllers
* present.
* return:
* 0 for FOUND a device
* < 0 for error code
*/
static int sbridge_probe(struct pci_dev *pdev, const struct pci_device_id *id)
static int sbridge_probe(const struct x86_cpu_id *id)
{
int rc = -ENODEV;
u8 mc, num_mc = 0;
struct sbridge_dev *sbridge_dev;
enum type type = SANDY_BRIDGE;
struct pci_id_table *ptable = (struct pci_id_table *)id->driver_data;
/* get the pci devices we want to reserve for our use */
mutex_lock(&sbridge_edac_lock);
/*
* All memory controllers are allocated at the first pass.
*/
if (unlikely(probed >= 1)) {
mutex_unlock(&sbridge_edac_lock);
return -ENODEV;
}
probed++;
rc = sbridge_get_all_devices(&num_mc, ptable);
switch (pdev->device) {
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA:
rc = sbridge_get_all_devices(&num_mc,
pci_dev_descr_ibridge_table);
type = IVY_BRIDGE;
break;
case PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_HA0:
rc = sbridge_get_all_devices(&num_mc,
pci_dev_descr_sbridge_table);
type = SANDY_BRIDGE;
break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0:
rc = sbridge_get_all_devices(&num_mc,
pci_dev_descr_haswell_table);
type = HASWELL;
break;
case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0:
rc = sbridge_get_all_devices(&num_mc,
pci_dev_descr_broadwell_table);
type = BROADWELL;
break;
case PCI_DEVICE_ID_INTEL_KNL_IMC_SAD0:
rc = sbridge_get_all_devices_knl(&num_mc,
pci_dev_descr_knl_table);
type = KNIGHTS_LANDING;
break;
}
if (unlikely(rc < 0)) {
edac_dbg(0, "couldn't get all devices for 0x%x\n", pdev->device);
edac_dbg(0, "couldn't get all devices\n");
goto fail0;
}
......@@ -3514,14 +3398,13 @@ static int sbridge_probe(struct pci_dev *pdev, const struct pci_device_id *id)
mc, mc + 1, num_mc);
sbridge_dev->mc = mc++;
rc = sbridge_register_mci(sbridge_dev, type);
rc = sbridge_register_mci(sbridge_dev, id - sbridge_cpuids);
if (unlikely(rc < 0))
goto fail1;
}
sbridge_printk(KERN_INFO, "%s\n", SBRIDGE_REVISION);
mutex_unlock(&sbridge_edac_lock);
return 0;
fail1:
......@@ -3530,74 +3413,47 @@ static int sbridge_probe(struct pci_dev *pdev, const struct pci_device_id *id)
sbridge_put_all_devices();
fail0:
mutex_unlock(&sbridge_edac_lock);
return rc;
}
/*
* sbridge_remove destructor for one instance of device
* sbridge_remove cleanup
*
*/
static void sbridge_remove(struct pci_dev *pdev)
static void sbridge_remove(void)
{
struct sbridge_dev *sbridge_dev;
edac_dbg(0, "\n");
/*
* we have a trouble here: pdev value for removal will be wrong, since
* it will point to the X58 register used to detect that the machine
* is a Nehalem or upper design. However, due to the way several PCI
* devices are grouped together to provide MC functionality, we need
* to use a different method for releasing the devices
*/
mutex_lock(&sbridge_edac_lock);
if (unlikely(!probed)) {
mutex_unlock(&sbridge_edac_lock);
return;
}
list_for_each_entry(sbridge_dev, &sbridge_edac_list, list)
sbridge_unregister_mci(sbridge_dev);
/* Release PCI resources */
sbridge_put_all_devices();
probed--;
mutex_unlock(&sbridge_edac_lock);
}
MODULE_DEVICE_TABLE(pci, sbridge_pci_tbl);
/*
* sbridge_driver pci_driver structure for this module
*
*/
static struct pci_driver sbridge_driver = {
.name = "sbridge_edac",
.probe = sbridge_probe,
.remove = sbridge_remove,
.id_table = sbridge_pci_tbl,
};
/*
* sbridge_init Module entry function
* Try to initialize this module for its devices
*/
static int __init sbridge_init(void)
{
int pci_rc;
const struct x86_cpu_id *id;
int rc;
edac_dbg(2, "\n");
id = x86_match_cpu(sbridge_cpuids);
if (!id)
return -ENODEV;
/* Ensure that the OPSTATE is set correctly for POLL or NMI */
opstate_init();
pci_rc = pci_register_driver(&sbridge_driver);
if (pci_rc >= 0) {
rc = sbridge_probe(id);
if (rc >= 0) {
mce_register_decode_chain(&sbridge_mce_dec);
if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
sbridge_printk(KERN_WARNING, "Loading driver, error reporting disabled.\n");
......@@ -3605,9 +3461,9 @@ static int __init sbridge_init(void)
}
sbridge_printk(KERN_ERR, "Failed to register device with error %d.\n",
pci_rc);
rc);
return pci_rc;
return rc;
}
/*
......@@ -3617,7 +3473,7 @@ static int __init sbridge_init(void)
static void __exit sbridge_exit(void)
{
edac_dbg(2, "\n");
pci_unregister_driver(&sbridge_driver);
sbridge_remove();
mce_unregister_decode_chain(&sbridge_mce_dec);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment