Commit 56ca4fde authored by Gavin Shan, committed by Benjamin Herrenschmidt

powerpc/eeh: Refactor the output message

The error reporting interrupt handler does not need the whole backtrace;
a one-line message is enough. For errors triggered by accessing PCI
config space or MMIO, we replace "WARN(1, ...)" with pr_err() and
dump_stack(). The patch also adds more output messages to indicate
what the EEH core is doing. Besides, some printk() calls are replaced
with pr_warning().
Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
parent 88b6d14b
...@@ -329,7 +329,9 @@ static int eeh_phb_check_failure(struct eeh_pe *pe) ...@@ -329,7 +329,9 @@ static int eeh_phb_check_failure(struct eeh_pe *pe)
eeh_serialize_unlock(flags); eeh_serialize_unlock(flags);
eeh_send_failure_event(phb_pe); eeh_send_failure_event(phb_pe);
WARN(1, "EEH: PHB failure detected\n"); pr_err("EEH: PHB#%x failure detected\n",
phb_pe->phb->global_number);
dump_stack();
return 1; return 1;
out: out:
...@@ -458,7 +460,10 @@ int eeh_dev_check_failure(struct eeh_dev *edev) ...@@ -458,7 +460,10 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
* a stack trace will help the device-driver authors figure * a stack trace will help the device-driver authors figure
* out what happened. So print that out. * out what happened. So print that out.
*/ */
WARN(1, "EEH: failure detected\n"); pr_err("EEH: Frozen PE#%x detected on PHB#%x\n",
pe->addr, pe->phb->global_number);
dump_stack();
return 1; return 1;
dn_unlock: dn_unlock:
......
...@@ -425,6 +425,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) ...@@ -425,6 +425,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
* status ... if any child can't handle the reset, then the entire * status ... if any child can't handle the reset, then the entire
* slot is dlpar removed and added. * slot is dlpar removed and added.
*/ */
pr_info("EEH: Notify device drivers to shutdown\n");
eeh_pe_dev_traverse(pe, eeh_report_error, &result); eeh_pe_dev_traverse(pe, eeh_report_error, &result);
/* Get the current PCI slot state. This can take a long time, /* Get the current PCI slot state. This can take a long time,
...@@ -432,7 +433,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) ...@@ -432,7 +433,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
*/ */
rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
printk(KERN_WARNING "EEH: Permanent failure\n"); pr_warning("EEH: Permanent failure\n");
goto hard_fail; goto hard_fail;
} }
...@@ -440,6 +441,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) ...@@ -440,6 +441,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
* don't post the error log until after all dev drivers * don't post the error log until after all dev drivers
* have been informed. * have been informed.
*/ */
pr_info("EEH: Collect temporary log\n");
eeh_slot_error_detail(pe, EEH_LOG_TEMP); eeh_slot_error_detail(pe, EEH_LOG_TEMP);
/* If all device drivers were EEH-unaware, then shut /* If all device drivers were EEH-unaware, then shut
...@@ -447,15 +449,18 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) ...@@ -447,15 +449,18 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
* go down willingly, without panicing the system. * go down willingly, without panicing the system.
*/ */
if (result == PCI_ERS_RESULT_NONE) { if (result == PCI_ERS_RESULT_NONE) {
pr_info("EEH: Reset with hotplug activity\n");
rc = eeh_reset_device(pe, frozen_bus); rc = eeh_reset_device(pe, frozen_bus);
if (rc) { if (rc) {
printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc); pr_warning("%s: Unable to reset, err=%d\n",
__func__, rc);
goto hard_fail; goto hard_fail;
} }
} }
/* If all devices reported they can proceed, then re-enable MMIO */ /* If all devices reported they can proceed, then re-enable MMIO */
if (result == PCI_ERS_RESULT_CAN_RECOVER) { if (result == PCI_ERS_RESULT_CAN_RECOVER) {
pr_info("EEH: Enable I/O for affected devices\n");
rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
if (rc < 0) if (rc < 0)
...@@ -463,6 +468,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) ...@@ -463,6 +468,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
if (rc) { if (rc) {
result = PCI_ERS_RESULT_NEED_RESET; result = PCI_ERS_RESULT_NEED_RESET;
} else { } else {
pr_info("EEH: Notify device drivers to resume I/O\n");
result = PCI_ERS_RESULT_NONE; result = PCI_ERS_RESULT_NONE;
eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result); eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result);
} }
...@@ -470,6 +476,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) ...@@ -470,6 +476,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
/* If all devices reported they can proceed, then re-enable DMA */ /* If all devices reported they can proceed, then re-enable DMA */
if (result == PCI_ERS_RESULT_CAN_RECOVER) { if (result == PCI_ERS_RESULT_CAN_RECOVER) {
pr_info("EEH: Enabled DMA for affected devices\n");
rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
if (rc < 0) if (rc < 0)
...@@ -482,17 +489,22 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) ...@@ -482,17 +489,22 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
/* If any device has a hard failure, then shut off everything. */ /* If any device has a hard failure, then shut off everything. */
if (result == PCI_ERS_RESULT_DISCONNECT) { if (result == PCI_ERS_RESULT_DISCONNECT) {
printk(KERN_WARNING "EEH: Device driver gave up\n"); pr_warning("EEH: Device driver gave up\n");
goto hard_fail; goto hard_fail;
} }
/* If any device called out for a reset, then reset the slot */ /* If any device called out for a reset, then reset the slot */
if (result == PCI_ERS_RESULT_NEED_RESET) { if (result == PCI_ERS_RESULT_NEED_RESET) {
pr_info("EEH: Reset without hotplug activity\n");
rc = eeh_reset_device(pe, NULL); rc = eeh_reset_device(pe, NULL);
if (rc) { if (rc) {
printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc); pr_warning("%s: Cannot reset, err=%d\n",
__func__, rc);
goto hard_fail; goto hard_fail;
} }
pr_info("EEH: Notify device drivers "
"the completion of reset\n");
result = PCI_ERS_RESULT_NONE; result = PCI_ERS_RESULT_NONE;
eeh_pe_dev_traverse(pe, eeh_report_reset, &result); eeh_pe_dev_traverse(pe, eeh_report_reset, &result);
} }
...@@ -500,11 +512,12 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) ...@@ -500,11 +512,12 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
/* All devices should claim they have recovered by now. */ /* All devices should claim they have recovered by now. */
if ((result != PCI_ERS_RESULT_RECOVERED) && if ((result != PCI_ERS_RESULT_RECOVERED) &&
(result != PCI_ERS_RESULT_NONE)) { (result != PCI_ERS_RESULT_NONE)) {
printk(KERN_WARNING "EEH: Not recovered\n"); pr_warning("EEH: Not recovered\n");
goto hard_fail; goto hard_fail;
} }
/* Tell all device drivers that they can resume operations */ /* Tell all device drivers that they can resume operations */
pr_info("EEH: Notify device driver to resume\n");
eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
return; return;
......
...@@ -853,11 +853,14 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) ...@@ -853,11 +853,14 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
phb->eeh_state |= PNV_EEH_STATE_REMOVED; phb->eeh_state |= PNV_EEH_STATE_REMOVED;
} }
WARN(1, "EEH: dead IOC detected\n"); pr_err("EEH: dead IOC detected\n");
ret = 4; ret = 4;
goto out; goto out;
} else if (severity == OPAL_EEH_SEV_INF) } else if (severity == OPAL_EEH_SEV_INF) {
pr_info("EEH: IOC informative error "
"detected\n");
ioda_eeh_hub_diag(hose); ioda_eeh_hub_diag(hose);
}
break; break;
case OPAL_EEH_PHB_ERROR: case OPAL_EEH_PHB_ERROR:
...@@ -865,8 +868,8 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) ...@@ -865,8 +868,8 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
if (ioda_eeh_get_phb_pe(hose, pe)) if (ioda_eeh_get_phb_pe(hose, pe))
break; break;
WARN(1, "EEH: dead PHB#%x detected\n", pr_err("EEH: dead PHB#%x detected\n",
hose->global_number); hose->global_number);
phb->eeh_state |= PNV_EEH_STATE_REMOVED; phb->eeh_state |= PNV_EEH_STATE_REMOVED;
ret = 3; ret = 3;
goto out; goto out;
...@@ -874,20 +877,24 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) ...@@ -874,20 +877,24 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
if (ioda_eeh_get_phb_pe(hose, pe)) if (ioda_eeh_get_phb_pe(hose, pe))
break; break;
WARN(1, "EEH: fenced PHB#%x detected\n", pr_err("EEH: fenced PHB#%x detected\n",
hose->global_number); hose->global_number);
ret = 2; ret = 2;
goto out; goto out;
} else if (severity == OPAL_EEH_SEV_INF) } else if (severity == OPAL_EEH_SEV_INF) {
pr_info("EEH: PHB#%x informative error "
"detected\n",
hose->global_number);
ioda_eeh_phb_diag(hose); ioda_eeh_phb_diag(hose);
}
break; break;
case OPAL_EEH_PE_ERROR: case OPAL_EEH_PE_ERROR:
if (ioda_eeh_get_pe(hose, frozen_pe_no, pe)) if (ioda_eeh_get_pe(hose, frozen_pe_no, pe))
break; break;
WARN(1, "EEH: Frozen PE#%x on PHB#%x detected\n", pr_err("EEH: Frozen PE#%x on PHB#%x detected\n",
(*pe)->addr, (*pe)->phb->global_number); (*pe)->addr, (*pe)->phb->global_number);
ret = 1; ret = 1;
goto out; goto out;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment