Commit 5a71978e, authored by Gavin Shan, committed by Benjamin Herrenschmidt

powerpc/eeh: Trace time on first error for PE

A PE is not expected to be frozen more than 5 times within the
last hour. If it is, the PE is removed from the system when the
next EEH error arrives. This patch introduces a time stamp that
records the first error on a specific PE within the last hour,
together with a function that updates it accordingly. In addition,
the time stamp is restored during the PE hotplug path, as is
already done for the freeze count.
Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
parent c8608558
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <linux/init.h> #include <linux/init.h>
#include <linux/list.h> #include <linux/list.h>
#include <linux/string.h> #include <linux/string.h>
#include <linux/time.h>
struct pci_dev; struct pci_dev;
struct pci_bus; struct pci_bus;
...@@ -62,6 +63,7 @@ struct eeh_pe { ...@@ -62,6 +63,7 @@ struct eeh_pe {
struct pci_bus *bus; /* Top PCI bus for bus PE */ struct pci_bus *bus; /* Top PCI bus for bus PE */
int check_count; /* Times of ignored error */ int check_count; /* Times of ignored error */
int freeze_count; /* Times of froze up */ int freeze_count; /* Times of froze up */
struct timeval tstamp; /* Time on first-time freeze */
int false_positives; /* Times of reported #ff's */ int false_positives; /* Times of reported #ff's */
struct eeh_pe *parent; /* Parent PE */ struct eeh_pe *parent; /* Parent PE */
struct list_head child_list; /* Link PE to the child list */ struct list_head child_list; /* Link PE to the child list */
...@@ -190,6 +192,7 @@ struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); ...@@ -190,6 +192,7 @@ struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
struct eeh_pe *eeh_pe_get(struct eeh_dev *edev); struct eeh_pe *eeh_pe_get(struct eeh_dev *edev);
int eeh_add_to_parent_pe(struct eeh_dev *edev); int eeh_add_to_parent_pe(struct eeh_dev *edev);
int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe); int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe);
void eeh_pe_update_time_stamp(struct eeh_pe *pe);
void *eeh_pe_dev_traverse(struct eeh_pe *root, void *eeh_pe_dev_traverse(struct eeh_pe *root,
eeh_traverse_func fn, void *flag); eeh_traverse_func fn, void *flag);
void eeh_pe_restore_bars(struct eeh_pe *pe); void eeh_pe_restore_bars(struct eeh_pe *pe);
......
...@@ -349,10 +349,12 @@ static void *eeh_report_failure(void *data, void *userdata) ...@@ -349,10 +349,12 @@ static void *eeh_report_failure(void *data, void *userdata)
*/ */
static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
{ {
struct timeval tstamp;
int cnt, rc; int cnt, rc;
/* pcibios will clear the counter; save the value */ /* pcibios will clear the counter; save the value */
cnt = pe->freeze_count; cnt = pe->freeze_count;
tstamp = pe->tstamp;
/* /*
* We don't remove the corresponding PE instances because * We don't remove the corresponding PE instances because
...@@ -385,6 +387,8 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) ...@@ -385,6 +387,8 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
ssleep(5); ssleep(5);
pcibios_add_pci_devices(bus); pcibios_add_pci_devices(bus);
} }
pe->tstamp = tstamp;
pe->freeze_count = cnt; pe->freeze_count = cnt;
return 0; return 0;
...@@ -425,6 +429,7 @@ void eeh_handle_event(struct eeh_pe *pe) ...@@ -425,6 +429,7 @@ void eeh_handle_event(struct eeh_pe *pe)
return; return;
} }
eeh_pe_update_time_stamp(pe);
pe->freeze_count++; pe->freeze_count++;
if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES)
goto excess_failures; goto excess_failures;
......
...@@ -481,6 +481,33 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe) ...@@ -481,6 +481,33 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe)
return 0; return 0;
} }
/**
* eeh_pe_update_time_stamp - Update PE's frozen time stamp
* @pe: EEH PE
*
* We have time stamp for each PE to trace its time of getting
* frozen in last hour. The function should be called to update
* the time stamp on first error of the specific PE. On the other
* handle, we needn't account for errors happened in last hour.
*/
void eeh_pe_update_time_stamp(struct eeh_pe *pe)
{
struct timeval tstamp;
if (!pe) return;
if (pe->freeze_count <= 0) {
pe->freeze_count = 0;
do_gettimeofday(&pe->tstamp);
} else {
do_gettimeofday(&tstamp);
if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) {
pe->tstamp = tstamp;
pe->freeze_count = 0;
}
}
}
/** /**
* __eeh_pe_state_mark - Mark the state for the PE * __eeh_pe_state_mark - Mark the state for the PE
* @data: EEH PE * @data: EEH PE
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment