Commit 552d017e authored by Vincent Pelletier's avatar Vincent Pelletier

Implement raw data output.

parent c8ace6fb
......@@ -113,6 +113,21 @@ A mix of both above examples. Order matters !::
apachedex --skip-base "/site1/ignored(/|$|\?)"
--base "/site1(/|$|\?)" "/site2(/|$|\?)"
Saving the result of an analysis for faster reuse::
apachedex --default foo --format json --out save_state.json access.log
Continuing a saved analysis, updating collected data::
apachedex --default foo --format json --state-file save_state.json
--out save_state.json access.2.log
Generating HTML output from two state files, aggregating their content
without parsing more logs::
apachedex --default foo --state-file save_state.json save_state.2.json
--out index.html
Notes
=====
......@@ -120,3 +135,11 @@ When there are no hits for more than a graph period, placeholders are
generated for 0 hit (which is the reality) and 100% apdex (this is
arbitrary). Those placeholders only affect graphs, and do not affect
averages nor table content.
Loading saved states generated with different sets of parameters is not
prevented, but can produce nonsense/unreadable results. Or it can save the day
if you do want to mix different parameters (ex: you have some logs generated
with %T, others with %D).
It is unclear how the saved state format will evolve over time. Be prepared
to regenerate saved states when you upgrade APacheDEX.
- use some templating system instead of hardcoded html strings
- provide some form of raw data output, not just html
- allow user to specify min & max dates
......@@ -244,6 +244,20 @@ class APDEXStats(object):
'extra_right_class': extra_right_class,
}
@classmethod
def fromJSONState(cls, state, getDuration):
  """Alternate constructor: rebuild an APDEXStats from a saved JSON state.

  state: dict previously produced by asJSONState.
  getDuration: duration extraction callable (not serialised, so it must be
    supplied again on load).
  """
  loaded = cls(0, getDuration)
  loaded.__dict__.update(state)
  return loaded
def asJSONState(self):
  """Return a JSON-serialisable snapshot of this object's attributes.

  The getDuration callable is dropped, as it cannot be serialised; it is
  re-provided by fromJSONState on load.
  """
  state = dict(self.__dict__)
  state.pop('getDuration')
  return state
def _APDEXDateDictAsJSONState(date_dict):
  """Convert a date -> APDEXStats mapping into a JSON-serialisable dict.

  Fix: was a lambda assigned to a name (PEP 8 E731); a def gives the object
  a proper __name__ for tracebacks without changing behaviour.
  """
  return dict((date, apdex.asJSONState())
    for date, apdex in date_dict.iteritems())
class GenericSiteStats(object):
def __init__(self, threshold, getDuration, suffix, error_detail=False):
self.threshold = threshold
......@@ -370,6 +384,27 @@ class GenericSiteStats(object):
append('</table>')
return '\n'.join(result)
@classmethod
def fromJSONState(cls, state, getDuration, suffix):
  """Rebuild a GenericSiteStats instance from a state dict.

  state: dict previously produced by asJSONState.
  getDuration, suffix: runtime parameters which are not serialised and must
    be provided again on load.
  """
  error_detail = state['error_detail']
  site_stats = cls(state['threshold'], getDuration, suffix, error_detail)
  if error_detail:
    site_stats.error_url_count.update(state['error_url_count'])
  # Both per-url and global APDEX mappings are stored the same way.
  for attribute_id in ('url_apdex', 'apdex'):
    target = getattr(site_stats, attribute_id)
    for key, apdex_state in state[attribute_id].iteritems():
      target[key] = APDEXStats.fromJSONState(apdex_state, getDuration)
  return site_stats
def asJSONState(self):
  """Return a JSON-serialisable snapshot of collected site statistics."""
  state = {}
  state['threshold'] = self.threshold
  state['error_detail'] = self.error_detail
  # error_url_count only exists when error_detail was enabled, hence getattr.
  state['error_url_count'] = getattr(self, 'error_url_count', None)
  state['url_apdex'] = _APDEXDateDictAsJSONState(self.url_apdex)
  state['apdex'] = _APDEXDateDictAsJSONState(self.apdex)
  return state
class ERP5SiteStats(GenericSiteStats):
"""
Heuristic used:
......@@ -497,6 +532,30 @@ class ERP5SiteStats(GenericSiteStats):
placeholder_delta, graph_period, encoding, stat_filter=stat_filter))
return '\n'.join(result)
@classmethod
def fromJSONState(cls, state, getDuration, suffix):
  """Rebuild an ERP5SiteStats instance from a state dict.

  Restores the generic part through the parent class, then the
  ERP5-specific per-module and module-less APDEX mappings.
  """
  site_stats = super(ERP5SiteStats, cls).fromJSONState(state, getDuration,
    suffix)
  for module_id, module_state in state['module'].iteritems():
    module_dict = site_stats.module[module_id]
    for is_document, date_state in module_state.iteritems():
      date_dict = module_dict[is_document]
      for date, apdex_state in date_state.iteritems():
        date_dict[date] = APDEXStats.fromJSONState(apdex_state, getDuration)
  no_module_dict = site_stats.no_module
  for date, apdex_state in state['no_module'].iteritems():
    no_module_dict[date] = APDEXStats.fromJSONState(apdex_state, getDuration)
  return site_stats
def asJSONState(self):
  """Return a JSON-serialisable snapshot, including ERP5-specific data."""
  state = super(ERP5SiteStats, self).asJSONState()
  module_state = {}
  for module_id, module_dict in self.module.iteritems():
    per_document = {}
    for is_document, date_dict in module_dict.iteritems():
      per_document[is_document] = _APDEXDateDictAsJSONState(date_dict)
    module_state[module_id] = per_document
  state['module'] = module_state
  state['no_module'] = _APDEXDateDictAsJSONState(self.no_module)
  return state
# Apache LogFormat directives identifying how request duration was logged:
# %D logs microseconds, %T logs seconds (per Apache mod_log_config).
DURATION_US_FORMAT = '%D'
DURATION_S_FORMAT = '%T'
......@@ -732,15 +791,31 @@ def asHTML(out, encoding, per_site, args, default_site, period_parameter_dict,
out.write('</table>')
out.write('</body></html>')
def asJSON(out, encoding, per_site, *_):
  """Write per-site statistics to out as a JSON list of (site, state) pairs.

  out: writable file-like object.
  encoding: output encoding, as registered in format_generator.
  per_site: mapping of site identifier (or None) to site stats object.
  Extra positional arguments are accepted and ignored, to keep the same
  call signature as asHTML.

  Fix: honour the encoding parameter instead of hardcoding 'ascii'
  (format_generator already registers 'ascii' for this generator, so
  current behaviour is unchanged).
  """
  json.dump([(site, stats.asJSONState()) for site, stats in
    per_site.iteritems()], out, encoding=encoding)
# Maps --format argument value to (output generator function, output
# encoding passed to that generator).
format_generator = {
  'html': (asHTML, 'utf-8'),
  'json': (asJSON, 'ascii'),
}
# XXX: monkey-patching json module to emit strings instead of unicode objects.
# Because strings are faster, (30% overall performance hit moving to unicode
# objects), and only ASCII is expected (urlencoded is ASCII).
# Subclassing JSONDecoder is not enough as object parser uses scanstring
# directly.
original_scanstring = json.decoder.scanstring
def _scanstring(*args, **kw):
string, end = original_scanstring(*args, **kw)
return string.encode('ascii'), end
json.decoder.scanstring = _scanstring
def main():
global abs_file_container
parser = argparse.ArgumentParser(description='Compute Apdex out of '
'apache-style log files')
parser.add_argument('logfile', nargs='+',
parser.add_argument('logfile', nargs='*',
help='Log files to process')
parser.add_argument('-l', '--logformat',
default='%h %l %u %t "%r" %>s %O "%{Referer}i" "%{User-Agent}i" %D',
......@@ -750,6 +825,10 @@ def main():
help='Filename to write output to. Use - for stdout. Default: %(default)s')
parser.add_argument('-q', '--quiet', action='store_true',
help='Suppress warnings about malformed lines.')
parser.add_argument('--state-file', nargs='+', default=[], type=file,
help='Use given JSON files as initial state. Mixing files generated with '
'different parameters is allowed, but no correction is made. Output may '
'be unusable (ex: different --apdex, different --period, ...).')
group = parser.add_argument_group('generated content')
group.add_argument('-a', '--apdex', default=1.0, type=float,
......@@ -757,6 +836,8 @@ def main():
'Default: %(default).2fs')
group.add_argument('-e', '--error-detail', action='store_true',
help='Include detailed report (url & referers) for error statuses.')
group.add_argument('-f', '--format', choices=format_generator,
default='html', help='Format in which output should be generated.')
group.add_argument('-p', '--period', choices=period_parser,
help='Periodicity of sampling buckets. Default: (decide from data). '
'Performance note: leaving out this parameter reduces parsing '
......@@ -770,9 +851,10 @@ def main():
# Force embedding when file container is unknown (ex: pkg_resources).
# XXX: allow when --js is also provided ?
group.add_argument('--js', default=abs_file_container,
help='Folder containing needed js files. Default: %(default)s')
help='Folder containing needed js files when format is "html". '
'Default: %(default)s')
group.add_argument('--js-embed', action='store_true',
help='Embed js files instead of linking to them.')
help='Embed js files instead of linking to them when format is "html".')
group = parser.add_argument_group('site matching', 'Earlier arguments take '
'precedence. For example: --skip-base "/foo/bar(/|$|\\?)" '
......@@ -864,6 +946,24 @@ def main():
error_detail = args.error_detail
file_count = len(infile_list)
per_site = {}
for state_file in args.state_file:
state = json.load(state_file, encoding='ascii')
for url, site_state in state:
if url is None:
site = None
action = default_action
else:
for site, prefix_match, action in site_list:
if site == url:
break
else:
site = None
action = default_action
if action is None:
print >> sys.stderr, 'Info: no prefix match %r, stats skipped' % url
continue
per_site[site] = action.func.fromJSONState(site_state,
getDuration, action.keywords['suffix'])
skip_user_agent = list(itertools.chain(*args.skip_user_agent))
malformed_lines = 0
skipped_lines = 0
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment