Commit b90484ec authored by Sam Bobroff, committed by Michael Ellerman

powerpc/eeh: Cleanup control flow in eeh_handle_normal_event()

Rather than mixing "if (state)" blocks and gotos, convert entirely to
"if (state)" blocks to make the state machine behaviour clearer.
Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
parent fef7f905
...@@ -808,10 +808,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe) ...@@ -808,10 +808,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n", pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n",
pe->phb->global_number, pe->addr, pe->phb->global_number, pe->addr,
pe->freeze_count); pe->freeze_count);
goto hard_fail; result = PCI_ERS_RESULT_DISCONNECT;
} }
pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
pe->freeze_count, eeh_max_freezes);
/* Walk the various device drivers attached to this slot through /* Walk the various device drivers attached to this slot through
* a reset sequence, giving each an opportunity to do what it needs * a reset sequence, giving each an opportunity to do what it needs
...@@ -823,31 +821,39 @@ void eeh_handle_normal_event(struct eeh_pe *pe) ...@@ -823,31 +821,39 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
* the error. Override the result if necessary to have partially * the error. Override the result if necessary to have partially
* hotplug for this case. * hotplug for this case.
*/ */
pr_info("EEH: Notify device drivers to shutdown\n"); if (result != PCI_ERS_RESULT_DISCONNECT) {
eeh_set_channel_state(pe, pci_channel_io_frozen); pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
eeh_set_irq_state(pe, false); pe->freeze_count, eeh_max_freezes);
eeh_pe_report("error_detected(IO frozen)", pe, eeh_report_error, pr_info("EEH: Notify device drivers to shutdown\n");
&result); eeh_set_channel_state(pe, pci_channel_io_frozen);
if ((pe->type & EEH_PE_PHB) && eeh_set_irq_state(pe, false);
result != PCI_ERS_RESULT_NONE && eeh_pe_report("error_detected(IO frozen)", pe,
result != PCI_ERS_RESULT_NEED_RESET) eeh_report_error, &result);
result = PCI_ERS_RESULT_NEED_RESET; if ((pe->type & EEH_PE_PHB) &&
result != PCI_ERS_RESULT_NONE &&
result != PCI_ERS_RESULT_NEED_RESET)
result = PCI_ERS_RESULT_NEED_RESET;
}
/* Get the current PCI slot state. This can take a long time, /* Get the current PCI slot state. This can take a long time,
* sometimes over 300 seconds for certain systems. * sometimes over 300 seconds for certain systems.
*/ */
rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); if (result != PCI_ERS_RESULT_DISCONNECT) {
if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
pr_warn("EEH: Permanent failure\n"); if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
goto hard_fail; pr_warn("EEH: Permanent failure\n");
result = PCI_ERS_RESULT_DISCONNECT;
}
} }
/* Since rtas may enable MMIO when posting the error log, /* Since rtas may enable MMIO when posting the error log,
* don't post the error log until after all dev drivers * don't post the error log until after all dev drivers
* have been informed. * have been informed.
*/ */
pr_info("EEH: Collect temporary log\n"); if (result != PCI_ERS_RESULT_DISCONNECT) {
eeh_slot_error_detail(pe, EEH_LOG_TEMP); pr_info("EEH: Collect temporary log\n");
eeh_slot_error_detail(pe, EEH_LOG_TEMP);
}
/* If all device drivers were EEH-unaware, then shut /* If all device drivers were EEH-unaware, then shut
* down all of the device drivers, and hope they * down all of the device drivers, and hope they
...@@ -859,7 +865,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) ...@@ -859,7 +865,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
if (rc) { if (rc) {
pr_warn("%s: Unable to reset, err=%d\n", pr_warn("%s: Unable to reset, err=%d\n",
__func__, rc); __func__, rc);
goto hard_fail; result = PCI_ERS_RESULT_DISCONNECT;
} }
} }
...@@ -868,9 +874,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe) ...@@ -868,9 +874,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
pr_info("EEH: Enable I/O for affected devices\n"); pr_info("EEH: Enable I/O for affected devices\n");
rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
if (rc < 0) if (rc < 0) {
goto hard_fail; result = PCI_ERS_RESULT_DISCONNECT;
if (rc) { } else if (rc) {
result = PCI_ERS_RESULT_NEED_RESET; result = PCI_ERS_RESULT_NEED_RESET;
} else { } else {
pr_info("EEH: Notify device drivers to resume I/O\n"); pr_info("EEH: Notify device drivers to resume I/O\n");
...@@ -884,9 +890,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe) ...@@ -884,9 +890,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
pr_info("EEH: Enabled DMA for affected devices\n"); pr_info("EEH: Enabled DMA for affected devices\n");
rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
if (rc < 0) if (rc < 0) {
goto hard_fail; result = PCI_ERS_RESULT_DISCONNECT;
if (rc) { } else if (rc) {
result = PCI_ERS_RESULT_NEED_RESET; result = PCI_ERS_RESULT_NEED_RESET;
} else { } else {
/* /*
...@@ -899,12 +905,6 @@ void eeh_handle_normal_event(struct eeh_pe *pe) ...@@ -899,12 +905,6 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
} }
} }
/* If any device has a hard failure, then shut off everything. */
if (result == PCI_ERS_RESULT_DISCONNECT) {
pr_warn("EEH: Device driver gave up\n");
goto hard_fail;
}
/* If any device called out for a reset, then reset the slot */ /* If any device called out for a reset, then reset the slot */
if (result == PCI_ERS_RESULT_NEED_RESET) { if (result == PCI_ERS_RESULT_NEED_RESET) {
pr_info("EEH: Reset without hotplug activity\n"); pr_info("EEH: Reset without hotplug activity\n");
...@@ -912,89 +912,81 @@ void eeh_handle_normal_event(struct eeh_pe *pe) ...@@ -912,89 +912,81 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
if (rc) { if (rc) {
pr_warn("%s: Cannot reset, err=%d\n", pr_warn("%s: Cannot reset, err=%d\n",
__func__, rc); __func__, rc);
goto hard_fail; result = PCI_ERS_RESULT_DISCONNECT;
} else {
result = PCI_ERS_RESULT_NONE;
eeh_set_channel_state(pe, pci_channel_io_normal);
eeh_set_irq_state(pe, true);
eeh_pe_report("slot_reset", pe, eeh_report_reset,
&result);
} }
pr_info("EEH: Notify device drivers "
"the completion of reset\n");
result = PCI_ERS_RESULT_NONE;
eeh_set_channel_state(pe, pci_channel_io_normal);
eeh_set_irq_state(pe, true);
eeh_pe_report("slot_reset", pe, eeh_report_reset, &result);
} }
/* All devices should claim they have recovered by now. */ if ((result == PCI_ERS_RESULT_RECOVERED) ||
if ((result != PCI_ERS_RESULT_RECOVERED) && (result == PCI_ERS_RESULT_NONE)) {
(result != PCI_ERS_RESULT_NONE)) { /*
pr_warn("EEH: Not recovered\n"); * For those hot removed VFs, we should add back them after PF
goto hard_fail; * get recovered properly.
} */
list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list,
/* rmv_entry) {
* For those hot removed VFs, we should add back them after PF get eeh_add_virt_device(edev);
* recovered properly. list_del(&edev->rmv_entry);
*/
list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list,
rmv_entry) {
eeh_add_virt_device(edev);
list_del(&edev->rmv_entry);
}
/* Tell all device drivers that they can resume operations */
pr_info("EEH: Notify device driver to resume\n");
eeh_set_channel_state(pe, pci_channel_io_normal);
eeh_set_irq_state(pe, true);
eeh_pe_report("resume", pe, eeh_report_resume, NULL);
eeh_for_each_pe(pe, tmp_pe) {
eeh_pe_for_each_dev(tmp_pe, edev, tmp) {
edev->mode &= ~EEH_DEV_NO_HANDLER;
edev->in_error = false;
} }
}
pr_info("EEH: Recovery successful.\n"); /* Tell all device drivers that they can resume operations */
goto final; pr_info("EEH: Notify device driver to resume\n");
eeh_set_channel_state(pe, pci_channel_io_normal);
eeh_set_irq_state(pe, true);
eeh_pe_report("resume", pe, eeh_report_resume, NULL);
eeh_for_each_pe(pe, tmp_pe) {
eeh_pe_for_each_dev(tmp_pe, edev, tmp) {
edev->mode &= ~EEH_DEV_NO_HANDLER;
edev->in_error = false;
}
}
hard_fail: pr_info("EEH: Recovery successful.\n");
/* } else {
* About 90% of all real-life EEH failures in the field /*
* are due to poorly seated PCI cards. Only 10% or so are * About 90% of all real-life EEH failures in the field
* due to actual, failed cards. * are due to poorly seated PCI cards. Only 10% or so are
*/ * due to actual, failed cards.
pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" */
"Please try reseating or replacing it\n", pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
pe->phb->global_number, pe->addr); "Please try reseating or replacing it\n",
pe->phb->global_number, pe->addr);
eeh_slot_error_detail(pe, EEH_LOG_PERM); eeh_slot_error_detail(pe, EEH_LOG_PERM);
/* Notify all devices that they're about to go down. */ /* Notify all devices that they're about to go down. */
eeh_set_channel_state(pe, pci_channel_io_perm_failure); eeh_set_channel_state(pe, pci_channel_io_perm_failure);
eeh_set_irq_state(pe, false); eeh_set_irq_state(pe, false);
eeh_pe_report("error_detected(permanent failure)", pe, eeh_pe_report("error_detected(permanent failure)", pe,
eeh_report_failure, NULL); eeh_report_failure, NULL);
/* Mark the PE to be removed permanently */ /* Mark the PE to be removed permanently */
eeh_pe_state_mark(pe, EEH_PE_REMOVED); eeh_pe_state_mark(pe, EEH_PE_REMOVED);
/* /*
* Shut down the device drivers for good. We mark * Shut down the device drivers for good. We mark
* all removed devices correctly to avoid access * all removed devices correctly to avoid access
* the their PCI config any more. * the their PCI config any more.
*/ */
if (pe->type & EEH_PE_VF) { if (pe->type & EEH_PE_VF) {
eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
} else { } else {
eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
pci_lock_rescan_remove(); pci_lock_rescan_remove();
pci_hp_remove_devices(bus); pci_hp_remove_devices(bus);
pci_unlock_rescan_remove(); pci_unlock_rescan_remove();
/* The passed PE should no longer be used */ /* The passed PE should no longer be used */
return; return;
}
} }
final:
eeh_pe_state_clear(pe, EEH_PE_RECOVERING); eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment