Improve best-case line parsing performance.

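Supporting escaped quotes in quoted fields made the line regex slow, so build two regexes: a cheap but brittle one that is tried first, and an expensive but comprehensive one that is used as a fallback only when the cheap one fails to match. Result: +20% parsing speed!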

Improve best-case line parsing performance.
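Supporting escaped quotes in quoted fields made the line regex slow, so build two regexes: a cheap but brittle one that is tried first, and an expensive but comprehensive one that is used as a fallback only when the cheap one fails to match. Result: +20% parsing speed!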
a1d86d67 · Vincent Pelletier · 701f0b2b · a1d86d67
Commit a1d86d67 authored Apr 08, 2013 by Vincent Pelletier
Hide whitespace changes
Inline Side-by-side

Showing with 22 additions and 9 deletions

apachedex/__init__.py apachedex/__init__.py +22 -9

No files found.
--- a/apachedex/__init__.py
+++ b/apachedex/__init__.py
@@ -634,17 +634,24 @@ logformat_dict = {
  '%l': r'(?P<ident>[^ ]*)',
  '%u': r'(?P<user>[^ ]*)',
  '%t': r'\[(?P<timestamp>[^\]]*)\]',
-  '%r': r'(?P<request>(\\.|[^\\"])*)', # XXX: expected to be enclosed in ". See also REQUEST_PATTERN
+  '%r': r'(?P<request>[^"]*)', # XXX: expected to be enclosed in ". See also REQUEST_PATTERN
  '%>s': r'(?P<status>[0-9]*?)',
  '%O': r'(?P<size>[0-9-]*?)',
-  '%{Referer}i': r'(?P<referer>(\\.|[^\\"])*)', # XXX: expected to be enclosed in "
+  '%{Referer}i': r'(?P<referer>[^"]*)', # XXX: expected to be enclosed in "
-  '%{User-Agent}i': r'(?P<agent>(\\.|[^\\"])*)', # XXX: expected to be enclosed in "
+  '%{User-Agent}i': r'(?P<agent>[^"]*)', # XXX: expected to be enclosed in "
  DURATION_US_FORMAT: r'(?P<duration>[0-9]*)',
  DURATION_S_FORMAT: r'(?P<duration_s>[0-9]*)',
  '%%': r'%',
  # TODO: add more formats
 }
+# Expensive, but more robust, variants
+expensive_logformat_dict = {
+  '%r': r'(?P<request>(\\.|[^\\"])*)',
+  '%{Referer}i': r'(?P<referer>(\\.|[^\\"])*)',
+  '%{User-Agent}i': r'(?P<agent>(\\.|[^\\"])*)',
+}
 REQUEST_PATTERN = re.compile('(?P<method>[^ ]*) (?P<url>[^ ]*)'
  '( (?P<protocol>.*))?')
@@ -962,11 +969,12 @@ def main():
      'cannot be computed.'
    sys.exit(1)
  line_regex = ''
+  expensive_line_regex = ''
  try:
    n = iter(args.logformat).next
    while True:
      key = None
-      char = n()
+      expensive_char = char = n()
      if char == '%':
        fmt = n()
        key = char + fmt
@@ -978,10 +986,13 @@ def main():
        elif fmt == '>':
          key += n()
        char = logformat_dict[key]
+        expensive_char = expensive_logformat_dict.get(key, char)
      line_regex += char
+      expensive_line_regex += expensive_char
  except StopIteration:
    assert not key, key
  matchline = re.compile(line_regex).match
+  expensive_matchline = re.compile(expensive_line_regex).match
  matchrequest = REQUEST_PATTERN.match
  if args.period is None:
    next_period_data = ((x, y[4] * AUTO_PERIOD_COEF) for (x, y) in
@@ -1069,11 +1080,13 @@ def main():
        sys.stderr.flush()
      match = matchline(line)
      if match is None:
-        if not quiet:
+        match = expensive_matchline(line)
-          print >> sys.stderr, 'Malformed line at %s:%i: %r' % (
+        if match is None:
-            filename, lineno, line)
+          if not quiet:
-        malformed_lines += 1
+            print >> sys.stderr, 'Malformed line at %s:%i: %r' % (
-        continue
+              filename, lineno, line)
+          malformed_lines += 1
+          continue
      if match.group('agent') in skip_user_agent:
        skipped_user_agent += 1
        continue