Commit 5c7a35e3 authored by Gavin Shan, committed by Benjamin Herrenschmidt

powerpc/powernv: Fix killed EEH event

On the PowerNV platform, EEH errors are reported either by IO accessors or
by the interrupt-driven poller. Once a PE has been isolated, no further EEH
events are produced for that PE. With the current implementation, an EEH
event can be lost in the following way:

The interrupt handler queues one "special" event, which drives the poller.
Before the EEH thread picks up the special event, an IO accessor kicks in:
the frozen PE is marked as "isolated" and an EEH event is queued to the list.
The EEH thread then runs because of the special event and purges all existing
EEH events. However, no other EEH event is ever produced for the frozen PE.
Eventually, the PE stays marked as "isolated" with no EEH event left to
recover it.

The patch fixes the issue by keeping EEH events for PEs that have been
marked as "isolated", with the help of an additional "force" argument to
eeh_remove_event(). Unless "force" is set, events bound to isolated PEs
are left on the queue.
Reported-by: Rolf Brudeseth <rolfb@us.ibm.com>
Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
parent 6e0fdf9a
...@@ -33,7 +33,7 @@ struct eeh_event { ...@@ -33,7 +33,7 @@ struct eeh_event {
int eeh_event_init(void); int eeh_event_init(void);
int eeh_send_failure_event(struct eeh_pe *pe); int eeh_send_failure_event(struct eeh_pe *pe);
void eeh_remove_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force);
void eeh_handle_event(struct eeh_pe *pe); void eeh_handle_event(struct eeh_pe *pe);
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */
......
...@@ -770,7 +770,7 @@ static void eeh_handle_special_event(void) ...@@ -770,7 +770,7 @@ static void eeh_handle_special_event(void)
eeh_serialize_lock(&flags); eeh_serialize_lock(&flags);
/* Purge all events */ /* Purge all events */
eeh_remove_event(NULL); eeh_remove_event(NULL, true);
list_for_each_entry(hose, &hose_list, list_node) { list_for_each_entry(hose, &hose_list, list_node) {
phb_pe = eeh_phb_pe_get(hose); phb_pe = eeh_phb_pe_get(hose);
...@@ -789,7 +789,7 @@ static void eeh_handle_special_event(void) ...@@ -789,7 +789,7 @@ static void eeh_handle_special_event(void)
eeh_serialize_lock(&flags); eeh_serialize_lock(&flags);
/* Purge all events of the PHB */ /* Purge all events of the PHB */
eeh_remove_event(pe); eeh_remove_event(pe, true);
if (rc == EEH_NEXT_ERR_DEAD_PHB) if (rc == EEH_NEXT_ERR_DEAD_PHB)
eeh_pe_state_mark(pe, EEH_PE_ISOLATED); eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
......
...@@ -152,24 +152,33 @@ int eeh_send_failure_event(struct eeh_pe *pe) ...@@ -152,24 +152,33 @@ int eeh_send_failure_event(struct eeh_pe *pe)
/** /**
* eeh_remove_event - Remove EEH event from the queue * eeh_remove_event - Remove EEH event from the queue
* @pe: Event binding to the PE * @pe: Event binding to the PE
* @force: Event will be removed unconditionally
* *
* On PowerNV platform, we might have subsequent coming events * On PowerNV platform, we might have subsequent coming events
* is part of the former one. For that case, those subsequent * is part of the former one. For that case, those subsequent
* coming events are totally duplicated and unnecessary, thus * coming events are totally duplicated and unnecessary, thus
* they should be removed. * they should be removed.
*/ */
void eeh_remove_event(struct eeh_pe *pe) void eeh_remove_event(struct eeh_pe *pe, bool force)
{ {
unsigned long flags; unsigned long flags;
struct eeh_event *event, *tmp; struct eeh_event *event, *tmp;
/*
* If we have NULL PE passed in, we have dead IOC
* or we're sure we can report all existing errors
* by the caller.
*
* Without "force", events whose associated PE has
* already been isolated are not removed, to avoid
* losing those events.
*/
spin_lock_irqsave(&eeh_eventlist_lock, flags); spin_lock_irqsave(&eeh_eventlist_lock, flags);
list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) { list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) {
/* if (!force && event->pe &&
* If we don't have valid PE passed in, that means (event->pe->state & EEH_PE_ISOLATED))
* we already have event corresponding to dead IOC continue;
* and all events should be purged.
*/
if (!pe) { if (!pe) {
list_del(&event->list); list_del(&event->list);
kfree(event); kfree(event);
......
...@@ -717,7 +717,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) ...@@ -717,7 +717,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
* And we should keep the cached OPAL notifier event sychronized * And we should keep the cached OPAL notifier event sychronized
* between the kernel and firmware. * between the kernel and firmware.
*/ */
eeh_remove_event(NULL); eeh_remove_event(NULL, false);
opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul); opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
list_for_each_entry(hose, &hose_list, list_node) { list_for_each_entry(hose, &hose_list, list_node) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment