Commit fc62b3fc authored by Sumit.Saxena@avagotech.com's avatar Sumit.Saxena@avagotech.com Committed by Christoph Hellwig

megaraid_sas : Firmware crash dump feature support

Resending the patch. Addressed the review comments from Tomas Henzl.
Move buff_offset inside spinlock, corrected loop at crash dump buffer free,
reset_devices check is added to disable fw crash dump feature in kdump kernel.

This feature will provide similar interface as kernel crash dump feature.
When the megaraid firmware encounters a crash, the driver will collect the raw firmware image and
dump it into a pre-configured location.

The driver will allocate two different segments of memory.
#1 A large non-DMA-able buffer (allocated on demand) to capture the actual FW crash dump.
#2 A DMA buffer (persistent allocation) that only acts as an intermediary.

Firmware will keep writing crash dump data in chunks of the DMA buffer size into #2,
which will be copied back by the driver to the host memory described in #1.

Driver-Firmware interface:
==================
A.) Host driver can allocate maximum 512MB Host memory to store crash dump data.

This memory will be internal to the host and will not be exposed to the Firmware.
The driver may not be able to allocate 512 MB. In that case, the driver will allocate as much memory
as is available at run time to store crash dump data.

Let’s call this buffer as Host Crash Buffer.

Host Crash buffer will not be contiguous as a whole, but it will consist of multiple chunks of contiguous memory.
This will be internal to driver and firmware/application are unaware of it.
Partial allocation of Host Crash buffer may have valid information to debug depending upon
what was collected in that buffer and depending on nature of failure.

A complete crash dump is the best case, but we do want to capture a partial buffer just to grab something rather than nothing.
Host Crash buffer will be allocated only when FW crash dump data is available,
and will be deallocated once the application has copied Host Crash buffer to the file.
Host Crash buffer size can be anything between 1MB and 512MB. (It will be a multiple of 1MB.)

B.) Irrespective of the underlying firmware's crash dump capability,
the driver will allocate a DMA buffer at start of day for each MR controller.
Let’s call this buffer as “DMA Crash Buffer”.

For this feature, size of DMA crash buffer will be 1MB.
(We will not gain much even if DMA buffer size is increased.)

C.) Driver will now read Controller Info sending existing dcmd “MR_DCMD_CTRL_GET_INFO”.
Driver should extract the information from ctrl info provided by firmware and
figure out if firmware support crash dump feature or not.

Driver will enable crash dump feature only if
“Firmware support Crash dump” +
“Driver was able to create DMA Crash Buffer”.

If either one of the above is not set, the crash dump feature should be disabled in the driver.
Firmware will enable crash dump feature only if “Driver Send DCMD- MR_DCMD_SET_CRASH_BUF_PARA with MR_CRASH_BUF_TURN_ON”

Helper application/script should use sysfs parameter fw_crash_xxx to actually copy data from
host memory to the filesystem.
Signed-off-by: Sumit Saxena <sumit.saxena@avagotech.com>
Signed-off-by: Kashyap Desai <kashyap.desai@avagotech.com>
Reviewed-by: Tomas Henzl <thenzl@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
parent db4fc864
...@@ -105,6 +105,9 @@ ...@@ -105,6 +105,9 @@
#define MFI_STATE_READY 0xB0000000 #define MFI_STATE_READY 0xB0000000
#define MFI_STATE_OPERATIONAL 0xC0000000 #define MFI_STATE_OPERATIONAL 0xC0000000
#define MFI_STATE_FAULT 0xF0000000 #define MFI_STATE_FAULT 0xF0000000
#define MFI_STATE_FORCE_OCR 0x00000080
#define MFI_STATE_DMADONE 0x00000008
#define MFI_STATE_CRASH_DUMP_DONE 0x00000004
#define MFI_RESET_REQUIRED 0x00000001 #define MFI_RESET_REQUIRED 0x00000001
#define MFI_RESET_ADAPTER 0x00000002 #define MFI_RESET_ADAPTER 0x00000002
#define MEGAMFI_FRAME_SIZE 64 #define MEGAMFI_FRAME_SIZE 64
...@@ -191,6 +194,9 @@ ...@@ -191,6 +194,9 @@
#define MR_DCMD_CLUSTER_RESET_LD 0x08010200 #define MR_DCMD_CLUSTER_RESET_LD 0x08010200
#define MR_DCMD_PD_LIST_QUERY 0x02010100 #define MR_DCMD_PD_LIST_QUERY 0x02010100
#define MR_DCMD_CTRL_SET_CRASH_DUMP_PARAMS 0x01190100
#define MR_DRIVER_SET_APP_CRASHDUMP_MODE (0xF0010000 | 0x0600)
/* /*
* Global functions * Global functions
*/ */
...@@ -263,6 +269,25 @@ enum MFI_STAT { ...@@ -263,6 +269,25 @@ enum MFI_STAT {
MFI_STAT_INVALID_STATUS = 0xFF MFI_STAT_INVALID_STATUS = 0xFF
}; };
/*
* Crash dump related defines
*
* MAX_CRASH_DUMP_SIZE: number of slots in megasas_instance.crash_buf[],
* i.e. the maximum number of host crash buffers the driver tries to
* allocate. Each slot is allocated with get_order(CRASH_DMA_BUF_SIZE)
* pages, so a full dump is at most 512 buffers of 1MB each.
*
* CRASH_DMA_BUF_SIZE: size in bytes (1MB) of the block that is copied from
* the DMA crash buffer into one host crash buffer per collection pass.
*/
#define MAX_CRASH_DUMP_SIZE 512
#define CRASH_DMA_BUF_SIZE (1024 * 1024)
/*
* Values stored in megasas_instance.fw_crash_state to describe the state of
* the host-side copy of a firmware crash dump.
*/
enum MR_FW_CRASH_DUMP_STATE {
/* No dump is held; set when the host crash buffers are freed. */
UNAVAILABLE = 0,
/* Set by the crash dump work item once collection has finished. */
AVAILABLE = 1,
/*
* NOTE(review): the remaining states are not set by the code visible here;
* presumably they track the application's copy-out via sysfs - confirm.
*/
COPYING = 2,
COPIED = 3,
COPY_ERROR = 4,
};
/*
* Values passed as the crash_buf_state argument of
* megasas_set_crash_dump_params(). After a successful reset the driver sends
* MR_CRASH_BUF_TURN_ON when crash_dump_app_support is set and
* MR_CRASH_BUF_TURN_OFF otherwise.
*/
enum _MR_CRASH_BUF_STATUS {
MR_CRASH_BUF_TURN_OFF = 0,
MR_CRASH_BUF_TURN_ON = 1,
};
/* /*
* Number of mailbox bytes in DCMD message frame * Number of mailbox bytes in DCMD message frame
*/ */
...@@ -933,7 +958,19 @@ struct megasas_ctrl_info { ...@@ -933,7 +958,19 @@ struct megasas_ctrl_info {
u8 reserved; /*0x7E7*/ u8 reserved; /*0x7E7*/
} iov; } iov;
u8 pad[0x800-0x7E8]; /*0x7E8 pad to 2k */ struct {
#if defined(__BIG_ENDIAN_BITFIELD)
u32 reserved:25;
u32 supportCrashDump:1;
u32 reserved1:6;
#else
u32 reserved1:6;
u32 supportCrashDump:1;
u32 reserved:25;
#endif
} adapterOperations3;
u8 pad[0x800-0x7EC];
} __packed; } __packed;
/* /*
...@@ -1559,6 +1596,20 @@ struct megasas_instance { ...@@ -1559,6 +1596,20 @@ struct megasas_instance {
u32 *reply_queue; u32 *reply_queue;
dma_addr_t reply_queue_h; dma_addr_t reply_queue_h;
u32 *crash_dump_buf;
dma_addr_t crash_dump_h;
void *crash_buf[MAX_CRASH_DUMP_SIZE];
u32 crash_buf_pages;
unsigned int fw_crash_buffer_size;
unsigned int fw_crash_state;
unsigned int fw_crash_buffer_offset;
u32 drv_buf_index;
u32 drv_buf_alloc;
u32 crash_dump_fw_support;
u32 crash_dump_drv_support;
u32 crash_dump_app_support;
spinlock_t crashdump_lock;
struct megasas_register_set __iomem *reg_set; struct megasas_register_set __iomem *reg_set;
u32 *reply_post_host_index_addr[MR_MAX_MSIX_REG_ARRAY]; u32 *reply_post_host_index_addr[MR_MAX_MSIX_REG_ARRAY];
struct megasas_pd_list pd_list[MEGASAS_MAX_PD]; struct megasas_pd_list pd_list[MEGASAS_MAX_PD];
...@@ -1606,6 +1657,7 @@ struct megasas_instance { ...@@ -1606,6 +1657,7 @@ struct megasas_instance {
struct megasas_instance_template *instancet; struct megasas_instance_template *instancet;
struct tasklet_struct isr_tasklet; struct tasklet_struct isr_tasklet;
struct work_struct work_init; struct work_struct work_init;
struct work_struct crash_init;
u8 flag; u8 flag;
u8 unload; u8 unload;
...@@ -1830,4 +1882,8 @@ u16 MR_LdSpanArrayGet(u32 ld, u32 span, struct MR_FW_RAID_MAP_ALL *map); ...@@ -1830,4 +1882,8 @@ u16 MR_LdSpanArrayGet(u32 ld, u32 span, struct MR_FW_RAID_MAP_ALL *map);
u16 MR_PdDevHandleGet(u32 pd, struct MR_FW_RAID_MAP_ALL *map); u16 MR_PdDevHandleGet(u32 pd, struct MR_FW_RAID_MAP_ALL *map);
u16 MR_GetLDTgtId(u32 ld, struct MR_FW_RAID_MAP_ALL *map); u16 MR_GetLDTgtId(u32 ld, struct MR_FW_RAID_MAP_ALL *map);
int megasas_set_crash_dump_params(struct megasas_instance *instance,
u8 crash_buf_state);
void megasas_free_host_crash_buffer(struct megasas_instance *instance);
void megasas_fusion_crash_dump_wq(struct work_struct *work);
#endif /*LSI_MEGARAID_SAS_H */ #endif /*LSI_MEGARAID_SAS_H */
This diff is collapsed.
...@@ -91,6 +91,8 @@ void megasas_start_timer(struct megasas_instance *instance, ...@@ -91,6 +91,8 @@ void megasas_start_timer(struct megasas_instance *instance,
extern struct megasas_mgmt_info megasas_mgmt_info; extern struct megasas_mgmt_info megasas_mgmt_info;
extern int resetwaittime; extern int resetwaittime;
/** /**
* megasas_enable_intr_fusion - Enables interrupts * megasas_enable_intr_fusion - Enables interrupts
* @regs: MFI register set * @regs: MFI register set
...@@ -2055,7 +2057,7 @@ irqreturn_t megasas_isr_fusion(int irq, void *devp) ...@@ -2055,7 +2057,7 @@ irqreturn_t megasas_isr_fusion(int irq, void *devp)
{ {
struct megasas_irq_context *irq_context = devp; struct megasas_irq_context *irq_context = devp;
struct megasas_instance *instance = irq_context->instance; struct megasas_instance *instance = irq_context->instance;
u32 mfiStatus, fw_state; u32 mfiStatus, fw_state, dma_state;
if (instance->mask_interrupts) if (instance->mask_interrupts)
return IRQ_NONE; return IRQ_NONE;
...@@ -2077,7 +2079,16 @@ irqreturn_t megasas_isr_fusion(int irq, void *devp) ...@@ -2077,7 +2079,16 @@ irqreturn_t megasas_isr_fusion(int irq, void *devp)
/* If we didn't complete any commands, check for FW fault */ /* If we didn't complete any commands, check for FW fault */
fw_state = instance->instancet->read_fw_status_reg( fw_state = instance->instancet->read_fw_status_reg(
instance->reg_set) & MFI_STATE_MASK; instance->reg_set) & MFI_STATE_MASK;
if (fw_state == MFI_STATE_FAULT) { dma_state = instance->instancet->read_fw_status_reg
(instance->reg_set) & MFI_STATE_DMADONE;
if (instance->crash_dump_drv_support &&
instance->crash_dump_app_support) {
/* Start collecting crash, if DMA bit is done */
if ((fw_state == MFI_STATE_FAULT) && dma_state)
schedule_work(&instance->crash_init);
else if (fw_state == MFI_STATE_FAULT)
schedule_work(&instance->work_init);
} else if (fw_state == MFI_STATE_FAULT) {
printk(KERN_WARNING "megaraid_sas: Iop2SysDoorbellInt" printk(KERN_WARNING "megaraid_sas: Iop2SysDoorbellInt"
"for scsi%d\n", instance->host->host_no); "for scsi%d\n", instance->host->host_no);
schedule_work(&instance->work_init); schedule_work(&instance->work_init);
...@@ -2229,6 +2240,49 @@ megasas_read_fw_status_reg_fusion(struct megasas_register_set __iomem *regs) ...@@ -2229,6 +2240,49 @@ megasas_read_fw_status_reg_fusion(struct megasas_register_set __iomem *regs)
return readl(&(regs)->outbound_scratch_pad); return readl(&(regs)->outbound_scratch_pad);
} }
/**
 * megasas_alloc_host_crash_buffer -	Allocate host buffers for Crash dump
 *					collection from Firmware
 * @instance:				Controller's soft instance
 *
 * Tries to fill all MAX_CRASH_DUMP_SIZE slots of instance->crash_buf[] with
 * page allocations large enough to hold CRASH_DMA_BUF_SIZE bytes each, and
 * stops at the first allocation failure. Nothing is returned; the number of
 * buffers actually obtained is recorded in instance->drv_buf_alloc and the
 * page order that was used in instance->crash_buf_pages.
 */
static void
megasas_alloc_host_crash_buffer(struct megasas_instance *instance)
{
	unsigned int nr_bufs = 0;

	instance->crash_buf_pages = get_order(CRASH_DMA_BUF_SIZE);

	while (nr_bufs < MAX_CRASH_DUMP_SIZE) {
		void *chunk;

		chunk = (void *)__get_free_pages(GFP_KERNEL,
						 instance->crash_buf_pages);
		instance->crash_buf[nr_bufs] = chunk;
		if (!chunk) {
			/* A partial set of buffers is still usable. */
			dev_info(&instance->pdev->dev, "Firmware crash dump "
				 "memory allocation failed at index %d\n",
				 nr_bufs);
			break;
		}
		nr_bufs++;
	}

	instance->drv_buf_alloc = nr_bufs;
}
/**
 * megasas_free_host_crash_buffer -	Free host buffers used for Crash dump
 *					collection from Firmware
 * @instance:				Controller's soft instance
 *
 * Releases the first instance->drv_buf_alloc entries of
 * instance->crash_buf[] using the page order stored in
 * instance->crash_buf_pages, clears each released slot, and resets the crash
 * dump bookkeeping (drv_buf_index, drv_buf_alloc, fw_crash_buffer_size) with
 * fw_crash_state going back to UNAVAILABLE.
 */
void
megasas_free_host_crash_buffer(struct megasas_instance *instance)
{
	unsigned int i;

	for (i = 0; i < instance->drv_buf_alloc; i++) {
		if (!instance->crash_buf[i])
			continue;

		free_pages((ulong)instance->crash_buf[i],
			   instance->crash_buf_pages);
		/*
		 * Clear the slot so that no stale pointer to freed pages
		 * survives in the array once the counters are reset.
		 */
		instance->crash_buf[i] = NULL;
	}

	instance->drv_buf_index = 0;
	instance->drv_buf_alloc = 0;
	instance->fw_crash_state = UNAVAILABLE;
	instance->fw_crash_buffer_size = 0;
}
/** /**
* megasas_adp_reset_fusion - For controller reset * megasas_adp_reset_fusion - For controller reset
* @regs: MFI register set * @regs: MFI register set
...@@ -2372,6 +2426,7 @@ int megasas_reset_fusion(struct Scsi_Host *shost, int iotimeout) ...@@ -2372,6 +2426,7 @@ int megasas_reset_fusion(struct Scsi_Host *shost, int iotimeout)
struct megasas_cmd *cmd_mfi; struct megasas_cmd *cmd_mfi;
union MEGASAS_REQUEST_DESCRIPTOR_UNION *req_desc; union MEGASAS_REQUEST_DESCRIPTOR_UNION *req_desc;
u32 host_diag, abs_state, status_reg, reset_adapter; u32 host_diag, abs_state, status_reg, reset_adapter;
u32 io_timeout_in_crash_mode = 0;
instance = (struct megasas_instance *)shost->hostdata; instance = (struct megasas_instance *)shost->hostdata;
fusion = instance->ctrl_context; fusion = instance->ctrl_context;
...@@ -2385,6 +2440,42 @@ int megasas_reset_fusion(struct Scsi_Host *shost, int iotimeout) ...@@ -2385,6 +2440,42 @@ int megasas_reset_fusion(struct Scsi_Host *shost, int iotimeout)
mutex_unlock(&instance->reset_mutex); mutex_unlock(&instance->reset_mutex);
return FAILED; return FAILED;
} }
status_reg = instance->instancet->read_fw_status_reg(instance->reg_set);
abs_state = status_reg & MFI_STATE_MASK;
/* IO timeout detected, forcibly put FW in FAULT state */
if (abs_state != MFI_STATE_FAULT && instance->crash_dump_buf &&
instance->crash_dump_app_support && iotimeout) {
dev_info(&instance->pdev->dev, "IO timeout is detected, "
"forcibly FAULT Firmware\n");
instance->adprecovery = MEGASAS_ADPRESET_SM_INFAULT;
status_reg = readl(&instance->reg_set->doorbell);
writel(status_reg | MFI_STATE_FORCE_OCR,
&instance->reg_set->doorbell);
readl(&instance->reg_set->doorbell);
mutex_unlock(&instance->reset_mutex);
do {
ssleep(3);
io_timeout_in_crash_mode++;
dev_dbg(&instance->pdev->dev, "waiting for [%d] "
"seconds for crash dump collection and OCR "
"to be done\n", (io_timeout_in_crash_mode * 3));
} while ((instance->adprecovery != MEGASAS_HBA_OPERATIONAL) &&
(io_timeout_in_crash_mode < 80));
if (instance->adprecovery == MEGASAS_HBA_OPERATIONAL) {
dev_info(&instance->pdev->dev, "OCR done for IO "
"timeout case\n");
retval = SUCCESS;
} else {
dev_info(&instance->pdev->dev, "Controller is not "
"operational after 240 seconds wait for IO "
"timeout case in FW crash dump mode\n do "
"OCR/kill adapter\n");
retval = megasas_reset_fusion(shost, 0);
}
return retval;
}
if (instance->requestorId && !instance->skip_heartbeat_timer_del) if (instance->requestorId && !instance->skip_heartbeat_timer_del)
del_timer_sync(&instance->sriov_heartbeat_timer); del_timer_sync(&instance->sriov_heartbeat_timer);
...@@ -2651,6 +2742,15 @@ int megasas_reset_fusion(struct Scsi_Host *shost, int iotimeout) ...@@ -2651,6 +2742,15 @@ int megasas_reset_fusion(struct Scsi_Host *shost, int iotimeout)
printk(KERN_WARNING "megaraid_sas: Reset " printk(KERN_WARNING "megaraid_sas: Reset "
"successful for scsi%d.\n", "successful for scsi%d.\n",
instance->host->host_no); instance->host->host_no);
if (instance->crash_dump_drv_support) {
if (instance->crash_dump_app_support)
megasas_set_crash_dump_params(instance,
MR_CRASH_BUF_TURN_ON);
else
megasas_set_crash_dump_params(instance,
MR_CRASH_BUF_TURN_OFF);
}
retval = SUCCESS; retval = SUCCESS;
goto out; goto out;
} }
...@@ -2679,6 +2779,74 @@ int megasas_reset_fusion(struct Scsi_Host *shost, int iotimeout) ...@@ -2679,6 +2779,74 @@ int megasas_reset_fusion(struct Scsi_Host *shost, int iotimeout)
return retval; return retval;
} }
/* Fusion Crash dump collection work queue */
void megasas_fusion_crash_dump_wq(struct work_struct *work)
{
struct megasas_instance *instance =
container_of(work, struct megasas_instance, crash_init);
u32 status_reg;
u8 partial_copy = 0;
status_reg = instance->instancet->read_fw_status_reg(instance->reg_set);
/*
* Allocate host crash buffers to copy data from 1 MB DMA crash buffer
* to host crash buffers
*/
if (instance->drv_buf_index == 0) {
/* Buffer is already allocated for old Crash dump.
* Do OCR and do not wait for crash dump collection
*/
if (instance->drv_buf_alloc) {
dev_info(&instance->pdev->dev, "earlier crash dump is "
"not yet copied by application, ignoring this "
"crash dump and initiating OCR\n");
status_reg |= MFI_STATE_CRASH_DUMP_DONE;
writel(status_reg,
&instance->reg_set->outbound_scratch_pad);
readl(&instance->reg_set->outbound_scratch_pad);
return;
}
megasas_alloc_host_crash_buffer(instance);
dev_info(&instance->pdev->dev, "Number of host crash buffers "
"allocated: %d\n", instance->drv_buf_alloc);
}
/*
* Driver has allocated max buffers, which can be allocated
* and FW has more crash dump data, then driver will
* ignore the data.
*/
if (instance->drv_buf_index >= (instance->drv_buf_alloc)) {
dev_info(&instance->pdev->dev, "Driver is done copying "
"the buffer: %d\n", instance->drv_buf_alloc);
status_reg |= MFI_STATE_CRASH_DUMP_DONE;
partial_copy = 1;
} else {
memcpy(instance->crash_buf[instance->drv_buf_index],
instance->crash_dump_buf, CRASH_DMA_BUF_SIZE);
instance->drv_buf_index++;
status_reg &= ~MFI_STATE_DMADONE;
}
if (status_reg & MFI_STATE_CRASH_DUMP_DONE) {
dev_info(&instance->pdev->dev, "Crash Dump is available,number "
"of copied buffers: %d\n", instance->drv_buf_index);
instance->fw_crash_buffer_size = instance->drv_buf_index;
instance->fw_crash_state = AVAILABLE;
instance->drv_buf_index = 0;
writel(status_reg, &instance->reg_set->outbound_scratch_pad);
readl(&instance->reg_set->outbound_scratch_pad);
if (!partial_copy)
megasas_reset_fusion(instance->host, 0);
} else {
writel(status_reg, &instance->reg_set->outbound_scratch_pad);
readl(&instance->reg_set->outbound_scratch_pad);
}
}
/* Fusion OCR work queue */ /* Fusion OCR work queue */
void megasas_fusion_ocr_wq(struct work_struct *work) void megasas_fusion_ocr_wq(struct work_struct *work)
{ {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment