Add parsing stats.

ddb41afa · Vincent Pelletier · 7b7271f9 · ddb41afa
Commit ddb41afa authored Apr 03, 2013 by Vincent Pelletier
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 30 additions and 1 deletion

apachedex/__init__.py apachedex/__init__.py +30 -1

No files found.
--- a/apachedex/__init__.py
+++ b/apachedex/__init__.py
@@ -42,6 +42,7 @@ import gzip
 import os
 import re
 import sys
+import time

 try:
  import matplotlib
@@ -352,6 +353,8 @@ def main():
      help='Periodicity of sampling buckets. Default: %(default)r')
  parser.add_argument('-q', '--quiet', action='store_true',
    help='Suppress warnings about malformed lines.')
+  parser.add_argument('-s', '--stats', action='store_true',
+    help='Enable parsing stats (time taken to parse all files, ...)')
  args = parser.parse_args()
  line_regex = ''
  try:
@@ -388,6 +391,11 @@ def main():
  file_count = len(infile_list)
  per_site = {}
  hit_per_day = defaultdict(int)
+  malformed_lines = 0
+  skipped_lines = 0
+  no_url_lines = 0
+  all_lines = 0
+  start_time = time.time()
  for fileno, filename in enumerate(infile_list, 1):
    print >> sys.stderr, 'Processing %s [%i/%i]' % (
      filename, fileno, file_count)
@@ -407,9 +415,11 @@ def main():
        if not quiet:
          print >> sys.stderr, 'Malformed line at %s:%i: %r' % (
            filename, lineno, line)
+        malformed_lines += 1
        continue
      url_match = matchrequest(match.group('request'))
      if url_match is None:
+        no_url_lines += 1
        continue
      url = url_match.group('url')
      if url.startswith('http'):
@@ -422,6 +432,7 @@ def main():
        site = default_site
        action = default_action
      if action is None:
+        skipped_lines += 1
        continue
      utcdate = asDate(match.group('timestamp'))
      hit_per_day[utcdate] += 1
@@ -430,9 +441,11 @@ def main():
      except KeyError:
        site_data = per_site[site] = action(threshold)
      site_data.accumulate(match, url_match, utcdate)
+    all_lines += lineno
+  end_parsing_time = time.time()
  os.chdir(args.out)
  with open('index.html', 'w') as out:
-    out.write('<html><head><title>Stats</title><style>th, td { border: solid 1px #000; } th { text-align: center; } td { text-align: right; } td.text { text-align: left; } table { border-collapse: collapse; } .problem { background-color: #f00; color: white; } .warning { background-color: #f80; color: white; } </style></head><body><h1>Overall</h1><h2>Hits per day</h2><table><tr><th>date</th><th>hits</th></tr>')
+    out.write('<html><head><title>Stats</title><style>th, td { border: solid 1px #000; } th { text-align: center; } td { text-align: right; } th.text, td.text { text-align: left; } table { border-collapse: collapse; } .problem { background-color: #f00; color: white; } .warning { background-color: #f80; color: white; } </style></head><body><h1>Overall</h1><h2>Hits per day</h2><table><tr><th>date</th><th>hits</th></tr>')
    for date, hit in sorted(hit_per_day.iteritems(), key=ITEMGETTER0):
      out.write('<tr><td>%s</td><td>%s</td></tr>' % (date, hit))
    out.write('</table>')
@@ -473,6 +486,22 @@ def main():

        out.write('<img src="' + plot_filename + '" />')
      out.write(data.asHTML())
+    end_stat_time = time.time()
+    if args.stats:
+      out.write('<h1>Parsing stats</h1><table>')
+      for caption, value in (
+            ('Execution date', datetime.now().isoformat()),
+            ('File count', file_count),
+            ('Lines', all_lines),
+            ('... malformed', malformed_lines),
+            ('... URL-less', no_url_lines),
+            ('... skipped', skipped_lines),
+            ('Parsing time', '%.2fs' % (end_parsing_time - start_time)),
+            ('Rendering time', '%.2fs' % (end_stat_time - end_parsing_time)),
+          ):
+        out.write('<tr><th class="text">%s</th><td>%s</td></tr>' % (
+          caption, value))
+      out.write('</table>')
    out.write('</body></html>')

 if __name__ == '__main__':