Commit 8f95b29c authored by Weilin Wang, committed by Namhyung Kim

perf test: Simplify metric value validation test final report

The original test report was too complicated to read and contained
information that was not really useful. This update simplifies the report,
which should largely improve its readability.
Signed-off-by: Weilin Wang <weilin.wang@intel.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Samantha Alt <samantha.alt@intel.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240130180907.639729-1-weilin.wang@intel.com
parent 1c84b47f
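
For illustration, a minimal sketch (not part of the commit) of how a failure now
surfaces through the TestError class added below; the metric name and the
collected/expected values here are hypothetical:

    # Hypothetical metric name and values, using the TestError class from this patch
    err = TestError(['tma_frontend_bound'], 'true', [1.7], 0, 1)
    print(err)
    # Prints something like:
    #   Wrong Metric Value Error:  The collected value of metric ['tma_frontend_bound']
    #       is [1.7] in workload(s): ['true']
    #       but expected value range is [0, 1]
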
#SPDX-License-Identifier: GPL-2.0
# SPDX-License-Identifier: GPL-2.0
import re
import csv
import json
@@ -6,36 +6,61 @@ import argparse
from pathlib import Path
import subprocess
class TestError:
def __init__(self, metric: list[str], wl: str, value: list[float], low: float, up=float('nan'), description=str()):
self.metric: list = metric # multiple metrics in relationship type tests
self.workloads = [wl] # multiple workloads possible
self.collectedValue: list = value
self.valueLowBound = low
self.valueUpBound = up
self.description = description
def __repr__(self) -> str:
if len(self.metric) > 1:
return "\nMetric Relationship Error: \tThe collected value of metric {0}\n\
\tis {1} in workload(s): {2} \n\
\tbut expected value range is [{3}, {4}]\n\
\tRelationship rule description: \'{5}\'".format(self.metric, self.collectedValue, self.workloads,
self.valueLowBound, self.valueUpBound, self.description)
elif len(self.collectedValue) == 0:
return "\nNo Metric Value Error: \tMetric {0} returns with no value \n\
\tworkload(s): {1}".format(self.metric, self.workloads)
else:
return "\nWrong Metric Value Error: \tThe collected value of metric {0}\n\
\tis {1} in workload(s): {2}\n\
\tbut expected value range is [{3}, {4}]"\
.format(self.metric, self.collectedValue, self.workloads,
self.valueLowBound, self.valueUpBound)
class Validator:
def __init__(self, rulefname, reportfname='', t=5, debug=False, datafname='', fullrulefname='', workload='true', metrics=''):
self.rulefname = rulefname
self.reportfname = reportfname
self.rules = None
self.collectlist:str = metrics
self.collectlist: str = metrics
self.metrics = self.__set_metrics(metrics)
self.skiplist = set()
self.tolerance = t
self.workloads = [x for x in workload.split(",") if x]
self.wlidx = 0 # idx of current workloads
self.allresults = dict() # metric results of all workload
self.allignoremetrics = dict() # metrics with no results or negative results
self.allfailtests = dict()
self.wlidx = 0 # idx of current workloads
self.allresults = dict() # metric results of all workload
self.alltotalcnt = dict()
self.allpassedcnt = dict()
self.allerrlist = dict()
self.results = dict() # metric results of current workload
self.results = dict() # metric results of current workload
# vars for test pass/failure statistics
self.ignoremetrics= set() # metrics with no results or negative results, neg result counts as a failed test
self.failtests = dict()
# metrics with no results or negative results, neg result counts failed tests
self.ignoremetrics = set()
self.totalcnt = 0
self.passedcnt = 0
# vars for errors
self.errlist = list()
# vars for Rule Generator
self.pctgmetrics = set() # Percentage rule
self.pctgmetrics = set() # Percentage rule
# vars for debug
self.datafname = datafname
@@ -69,10 +94,10 @@ class Validator:
ensure_ascii=True,
indent=4)
def get_results(self, idx:int = 0):
def get_results(self, idx: int = 0):
return self.results[idx]
def get_bounds(self, lb, ub, error, alias={}, ridx:int = 0) -> list:
def get_bounds(self, lb, ub, error, alias={}, ridx: int = 0) -> list:
"""
Get bounds and tolerance from lb, ub, and error.
If missing lb, use 0.0; missing ub, use float('inf'); missing error, use self.tolerance.
@@ -85,7 +110,7 @@ class Validator:
tolerance, denormalized based on upper bound value
"""
# init ubv and lbv to invalid values
def get_bound_value (bound, initval, ridx):
def get_bound_value(bound, initval, ridx):
val = initval
if isinstance(bound, int) or isinstance(bound, float):
val = bound
@@ -113,10 +138,10 @@ class Validator:
return lbv, ubv, denormerr
def get_value(self, name:str, ridx:int = 0) -> list:
def get_value(self, name: str, ridx: int = 0) -> list:
"""
Get value of the metric from self.results.
If result of this metric is not provided, the metric name will be added into self.ignoremetics and self.errlist.
If result of this metric is not provided, the metric name will be added into self.ignoremetics.
All future test(s) on this metric will fail.
@param name: name of the metric
@@ -142,7 +167,7 @@ class Validator:
Check if metrics value are non-negative.
One metric is counted as one test.
Failure: when metric value is negative or not provided.
Metrics with negative value will be added into the self.failtests['PositiveValueTest'] and self.ignoremetrics.
Metrics with negative value will be added into self.ignoremetrics.
"""
negmetric = dict()
pcnt = 0
@@ -155,25 +180,27 @@ class Validator:
else:
pcnt += 1
tcnt += 1
# The first round collect_perf() run these metrics with simple workload
# "true". We give metrics a second chance with a longer workload if less
# than 20 metrics failed positive test.
if len(rerun) > 0 and len(rerun) < 20:
second_results = dict()
self.second_test(rerun, second_results)
for name, val in second_results.items():
if name not in negmetric: continue
if name not in negmetric:
continue
if val >= 0:
del negmetric[name]
pcnt += 1
self.failtests['PositiveValueTest']['Total Tests'] = tcnt
self.failtests['PositiveValueTest']['Passed Tests'] = pcnt
if len(negmetric.keys()):
self.ignoremetrics.update(negmetric.keys())
negmessage = ["{0}(={1:.4f})".format(name, val) for name, val in negmetric.items()]
self.failtests['PositiveValueTest']['Failed Tests'].append({'NegativeValue': negmessage})
self.errlist.extend(
[TestError([m], self.workloads[self.wlidx], negmetric[m], 0) for m in negmetric.keys()])
return
def evaluate_formula(self, formula:str, alias:dict, ridx:int = 0):
def evaluate_formula(self, formula: str, alias: dict, ridx: int = 0):
"""
Evaluate the value of formula.
@@ -187,10 +214,11 @@ class Validator:
sign = "+"
f = str()
#TODO: support parenthesis?
# TODO: support parenthesis?
for i in range(len(formula)):
if i+1 == len(formula) or formula[i] in ('+', '-', '*', '/'):
s = alias[formula[b:i]] if i+1 < len(formula) else alias[formula[b:]]
s = alias[formula[b:i]] if i + \
1 < len(formula) else alias[formula[b:]]
v = self.get_value(s, ridx)
if not v:
errs.append(s)
@@ -228,49 +256,49 @@ class Validator:
alias = dict()
for m in rule['Metrics']:
alias[m['Alias']] = m['Name']
lbv, ubv, t = self.get_bounds(rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'], alias, ridx=rule['RuleIndex'])
val, f = self.evaluate_formula(rule['Formula'], alias, ridx=rule['RuleIndex'])
lbv, ubv, t = self.get_bounds(
rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'], alias, ridx=rule['RuleIndex'])
val, f = self.evaluate_formula(
rule['Formula'], alias, ridx=rule['RuleIndex'])
lb = rule['RangeLower']
ub = rule['RangeUpper']
if isinstance(lb, str):
if lb in alias:
lb = alias[lb]
if isinstance(ub, str):
if ub in alias:
ub = alias[ub]
if val == -1:
self.failtests['RelationshipTest']['Failed Tests'].append({'RuleIndex': rule['RuleIndex'], 'Description':f})
self.errlist.append(TestError([m['Name'] for m in rule['Metrics']], self.workloads[self.wlidx], [],
lb, ub, rule['Description']))
elif not self.check_bound(val, lbv, ubv, t):
lb = rule['RangeLower']
ub = rule['RangeUpper']
if isinstance(lb, str):
if lb in alias:
lb = alias[lb]
if isinstance(ub, str):
if ub in alias:
ub = alias[ub]
self.failtests['RelationshipTest']['Failed Tests'].append({'RuleIndex': rule['RuleIndex'], 'Formula':f,
'RangeLower': lb, 'LowerBoundValue': self.get_value(lb),
'RangeUpper': ub, 'UpperBoundValue':self.get_value(ub),
'ErrorThreshold': t, 'CollectedValue': val})
self.errlist.append(TestError([m['Name'] for m in rule['Metrics']], self.workloads[self.wlidx], [val],
lb, ub, rule['Description']))
else:
self.passedcnt += 1
self.failtests['RelationshipTest']['Passed Tests'] += 1
self.totalcnt += 1
self.failtests['RelationshipTest']['Total Tests'] += 1
return
# Single Metric Test
def single_test(self, rule:dict):
def single_test(self, rule: dict):
"""
Validate if the metrics are in the required value range.
eg. lower_bound <= metrics_value <= upper_bound
One metric is counted as one test in this type of test.
One rule may include one or more metrics.
Failure: when the metric value not provided or the value is outside the bounds.
This test updates self.total_cnt and records failed tests in self.failtest['SingleMetricTest'].
This test updates self.total_cnt.
@param rule: dict with metrics to validate and the value range requirement
"""
lbv, ubv, t = self.get_bounds(rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'])
lbv, ubv, t = self.get_bounds(
rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'])
metrics = rule['Metrics']
passcnt = 0
totalcnt = 0
faillist = list()
failures = dict()
rerun = list()
for m in metrics:
@@ -286,25 +314,20 @@ class Validator:
second_results = dict()
self.second_test(rerun, second_results)
for name, val in second_results.items():
if name not in failures: continue
if name not in failures:
continue
if self.check_bound(val, lbv, ubv, t):
passcnt += 1
del failures[name]
else:
failures[name] = val
failures[name] = [val]
self.results[0][name] = val
self.totalcnt += totalcnt
self.passedcnt += passcnt
self.failtests['SingleMetricTest']['Total Tests'] += totalcnt
self.failtests['SingleMetricTest']['Passed Tests'] += passcnt
if len(failures.keys()) != 0:
faillist = [{'MetricName':name, 'CollectedValue':val} for name, val in failures.items()]
self.failtests['SingleMetricTest']['Failed Tests'].append({'RuleIndex':rule['RuleIndex'],
'RangeLower': rule['RangeLower'],
'RangeUpper': rule['RangeUpper'],
'ErrorThreshold':rule['ErrorThreshold'],
'Failure':faillist})
self.errlist.extend([TestError([name], self.workloads[self.wlidx], val,
rule['RangeLower'], rule['RangeUpper']) for name, val in failures.items()])
return
@@ -312,19 +335,11 @@ class Validator:
"""
Create final report and write into a JSON file.
"""
alldata = list()
for i in range(0, len(self.workloads)):
reportstas = {"Total Rule Count": self.alltotalcnt[i], "Passed Rule Count": self.allpassedcnt[i]}
data = {"Metric Validation Statistics": reportstas, "Tests in Category": self.allfailtests[i],
"Errors":self.allerrlist[i]}
alldata.append({"Workload": self.workloads[i], "Report": data})
json_str = json.dumps(alldata, indent=4)
print("Test validation finished. Final report: ")
print(json_str)
print(self.errlist)
if self.debug:
allres = [{"Workload": self.workloads[i], "Results": self.allresults[i]} for i in range(0, len(self.workloads))]
allres = [{"Workload": self.workloads[i], "Results": self.allresults[i]}
for i in range(0, len(self.workloads))]
self.json_dump(allres, self.datafname)
def check_rule(self, testtype, metric_list):
@@ -342,13 +357,13 @@ class Validator:
return True
# Start of Collector and Converter
def convert(self, data: list, metricvalues:dict):
def convert(self, data: list, metricvalues: dict):
"""
Convert collected metric data from the -j output to dict of {metric_name:value}.
"""
for json_string in data:
try:
result =json.loads(json_string)
result = json.loads(json_string)
if "metric-unit" in result and result["metric-unit"] != "(null)" and result["metric-unit"] != "":
name = result["metric-unit"].split(" ")[1] if len(result["metric-unit"].split(" ")) > 1 \
else result["metric-unit"]
@@ -365,9 +380,10 @@ class Validator:
print(" ".join(command))
cmd = subprocess.run(command, stderr=subprocess.PIPE, encoding='utf-8')
data = [x+'}' for x in cmd.stderr.split('}\n') if x]
if data[0][0] != '{':
data[0] = data[0][data[0].find('{'):]
return data
def collect_perf(self, workload: str):
"""
Collect metric data with "perf stat -M" on given workload with -a and -j.
@@ -385,14 +401,18 @@ class Validator:
if rule["TestType"] == "RelationshipTest":
metrics = [m["Name"] for m in rule["Metrics"]]
if not any(m not in collectlist[0] for m in metrics):
collectlist[rule["RuleIndex"]] = [",".join(list(set(metrics)))]
collectlist[rule["RuleIndex"]] = [
",".join(list(set(metrics)))]
for idx, metrics in collectlist.items():
if idx == 0: wl = "true"
else: wl = workload
if idx == 0:
wl = "true"
else:
wl = workload
for metric in metrics:
data = self._run_perf(metric, wl)
if idx not in self.results: self.results[idx] = dict()
if idx not in self.results:
self.results[idx] = dict()
self.convert(data, self.results[idx])
return
@@ -412,7 +432,8 @@ class Validator:
2) create metric name list
"""
command = ['perf', 'list', '-j', '--details', 'metrics']
cmd = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8')
cmd = subprocess.run(command, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, encoding='utf-8')
try:
data = json.loads(cmd.stdout)
for m in data:
@@ -453,12 +474,12 @@ class Validator:
rules = data['RelationshipRules']
self.skiplist = set([name.lower() for name in data['SkipList']])
self.rules = self.remove_unsupported_rules(rules)
pctgrule = {'RuleIndex':0,
'TestType':'SingleMetricTest',
'RangeLower':'0',
pctgrule = {'RuleIndex': 0,
'TestType': 'SingleMetricTest',
'RangeLower': '0',
'RangeUpper': '100',
'ErrorThreshold': self.tolerance,
'Description':'Metrics in percent unit have value with in [0, 100]',
'Description': 'Metrics in percent unit have value with in [0, 100]',
'Metrics': [{'Name': m.lower()} for m in self.pctgmetrics]}
self.rules.append(pctgrule)
@@ -469,8 +490,9 @@ class Validator:
idx += 1
if self.debug:
#TODO: need to test and generate file name correctly
data = {'RelationshipRules':self.rules, 'SupportedMetrics': [{"MetricName": name} for name in self.metrics]}
# TODO: need to test and generate file name correctly
data = {'RelationshipRules': self.rules, 'SupportedMetrics': [
{"MetricName": name} for name in self.metrics]}
self.json_dump(data, self.fullrulefname)
return
@@ -482,20 +504,17 @@ class Validator:
@param key: key to the dictionaries (index of self.workloads).
'''
self.allresults[key] = self.results
self.allignoremetrics[key] = self.ignoremetrics
self.allfailtests[key] = self.failtests
self.alltotalcnt[key] = self.totalcnt
self.allpassedcnt[key] = self.passedcnt
self.allerrlist[key] = self.errlist
#Initialize data structures before data validation of each workload
# Initialize data structures before data validation of each workload
def _init_data(self):
testtypes = ['PositiveValueTest', 'RelationshipTest', 'SingleMetricTest']
testtypes = ['PositiveValueTest',
'RelationshipTest', 'SingleMetricTest']
self.results = dict()
self.ignoremetrics= set()
self.ignoremetrics = set()
self.errlist = list()
self.failtests = {k:{'Total Tests':0, 'Passed Tests':0, 'Failed Tests':[]} for k in testtypes}
self.totalcnt = 0
self.passedcnt = 0
@@ -525,32 +544,33 @@ class Validator:
testtype = r['TestType']
if not self.check_rule(testtype, r['Metrics']):
continue
if testtype == 'RelationshipTest':
if testtype == 'RelationshipTest':
self.relationship_test(r)
elif testtype == 'SingleMetricTest':
self.single_test(r)
else:
print("Unsupported Test Type: ", testtype)
self.errlist.append("Unsupported Test Type from rule: " + r['RuleIndex'])
self._storewldata(i)
print("Workload: ", self.workloads[i])
print("Total metrics collected: ", self.failtests['PositiveValueTest']['Total Tests'])
print("Non-negative metric count: ", self.failtests['PositiveValueTest']['Passed Tests'])
print("Total Test Count: ", self.totalcnt)
print("Passed Test Count: ", self.passedcnt)
self._storewldata(i)
self.create_report()
return sum(self.alltotalcnt.values()) != sum(self.allpassedcnt.values())
return len(self.errlist) > 0
# End of Class Validator
def main() -> None:
parser = argparse.ArgumentParser(description="Launch metric value validation")
parser.add_argument("-rule", help="Base validation rule file", required=True)
parser.add_argument("-output_dir", help="Path for validator output file, report file", required=True)
parser.add_argument("-debug", help="Debug run, save intermediate data to files", action="store_true", default=False)
parser.add_argument("-wl", help="Workload to run while data collection", default="true")
parser = argparse.ArgumentParser(
description="Launch metric value validation")
parser.add_argument(
"-rule", help="Base validation rule file", required=True)
parser.add_argument(
"-output_dir", help="Path for validator output file, report file", required=True)
parser.add_argument("-debug", help="Debug run, save intermediate data to files",
action="store_true", default=False)
parser.add_argument(
"-wl", help="Workload to run while data collection", default="true")
parser.add_argument("-m", help="Metric list to validate", default="")
args = parser.parse_args()
outpath = Path(args.output_dir)
@@ -559,8 +579,8 @@ def main() -> None:
datafile = Path.joinpath(outpath, 'perf_data.json')
validator = Validator(args.rule, reportf, debug=args.debug,
datafname=datafile, fullrulefname=fullrule, workload=args.wl,
metrics=args.m)
datafname=datafile, fullrulefname=fullrule, workload=args.wl,
metrics=args.m)
ret = validator.test()
return ret
@@ -569,6 +589,3 @@ def main() -> None:
if __name__ == "__main__":
import sys
sys.exit(main())
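
A rough usage sketch (not part of the commit) of driving the Validator class
programmatically instead of via main(); the module name, rule file and report
path below are placeholder assumptions:

    import sys
    from perf_metric_validation import Validator  # assumed module name

    v = Validator('metric_rules.json', '/tmp/report.json', workload='true', metrics='')
    sys.exit(v.test())  # with this patch, non-zero only when TestErrors were recorded
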
@@ -19,6 +19,8 @@ echo "Output will be stored in: $tmpdir"
$PYTHON $pythonvalidator -rule $rulefile -output_dir $tmpdir -wl "${workload}"
ret=$?
rm -rf $tmpdir
if [ $ret -ne 0 ]; then
echo "Metric validation return with erros. Please check metrics reported with errors."
fi
exit $ret