Commit a1d86d67 authored by Vincent Pelletier

Improve best-case line parsing performance.

Support for escaped quotes in strings caused bad performance, so create
two regexes: a cheap & brittle one, and an expensive & comprehensive one.
+20% parsing speed !
parent 701f0b2b
...@@ -634,17 +634,24 @@ logformat_dict = { ...@@ -634,17 +634,24 @@ logformat_dict = {
'%l': r'(?P<ident>[^ ]*)', '%l': r'(?P<ident>[^ ]*)',
'%u': r'(?P<user>[^ ]*)', '%u': r'(?P<user>[^ ]*)',
'%t': r'\[(?P<timestamp>[^\]]*)\]', '%t': r'\[(?P<timestamp>[^\]]*)\]',
'%r': r'(?P<request>(\\.|[^\\"])*)', # XXX: expected to be enclosed in ". See also REQUEST_PATTERN '%r': r'(?P<request>[^"]*)', # XXX: expected to be enclosed in ". See also REQUEST_PATTERN
'%>s': r'(?P<status>[0-9]*?)', '%>s': r'(?P<status>[0-9]*?)',
'%O': r'(?P<size>[0-9-]*?)', '%O': r'(?P<size>[0-9-]*?)',
'%{Referer}i': r'(?P<referer>(\\.|[^\\"])*)', # XXX: expected to be enclosed in " '%{Referer}i': r'(?P<referer>[^"]*)', # XXX: expected to be enclosed in "
'%{User-Agent}i': r'(?P<agent>(\\.|[^\\"])*)', # XXX: expected to be enclosed in " '%{User-Agent}i': r'(?P<agent>[^"]*)', # XXX: expected to be enclosed in "
DURATION_US_FORMAT: r'(?P<duration>[0-9]*)', DURATION_US_FORMAT: r'(?P<duration>[0-9]*)',
DURATION_S_FORMAT: r'(?P<duration_s>[0-9]*)', DURATION_S_FORMAT: r'(?P<duration_s>[0-9]*)',
'%%': r'%', '%%': r'%',
# TODO: add more formats # TODO: add more formats
} }
# Expensive, but more robust, variants
# These patterns accept backslash-escaped characters (such as \") inside the
# quoted fields, which a plain [^"]* pattern stops at. Only the format keys
# whose pattern differs are listed here; when the expensive line regex is
# assembled, any key missing from this dict falls back to its logformat_dict
# pattern. The resulting regex is only tried on lines the cheap one rejects.
expensive_logformat_dict = {
'%r': r'(?P<request>(\\.|[^\\"])*)',
'%{Referer}i': r'(?P<referer>(\\.|[^\\"])*)',
'%{User-Agent}i': r'(?P<agent>(\\.|[^\\"])*)',
}
REQUEST_PATTERN = re.compile('(?P<method>[^ ]*) (?P<url>[^ ]*)' REQUEST_PATTERN = re.compile('(?P<method>[^ ]*) (?P<url>[^ ]*)'
'( (?P<protocol>.*))?') '( (?P<protocol>.*))?')
...@@ -962,11 +969,12 @@ def main(): ...@@ -962,11 +969,12 @@ def main():
'cannot be computed.' 'cannot be computed.'
sys.exit(1) sys.exit(1)
line_regex = '' line_regex = ''
expensive_line_regex = ''
try: try:
n = iter(args.logformat).next n = iter(args.logformat).next
while True: while True:
key = None key = None
char = n() expensive_char = char = n()
if char == '%': if char == '%':
fmt = n() fmt = n()
key = char + fmt key = char + fmt
...@@ -978,10 +986,13 @@ def main(): ...@@ -978,10 +986,13 @@ def main():
elif fmt == '>': elif fmt == '>':
key += n() key += n()
char = logformat_dict[key] char = logformat_dict[key]
expensive_char = expensive_logformat_dict.get(key, char)
line_regex += char line_regex += char
expensive_line_regex += expensive_char
except StopIteration: except StopIteration:
assert not key, key assert not key, key
matchline = re.compile(line_regex).match matchline = re.compile(line_regex).match
expensive_matchline = re.compile(expensive_line_regex).match
matchrequest = REQUEST_PATTERN.match matchrequest = REQUEST_PATTERN.match
if args.period is None: if args.period is None:
next_period_data = ((x, y[4] * AUTO_PERIOD_COEF) for (x, y) in next_period_data = ((x, y[4] * AUTO_PERIOD_COEF) for (x, y) in
...@@ -1069,11 +1080,13 @@ def main(): ...@@ -1069,11 +1080,13 @@ def main():
sys.stderr.flush() sys.stderr.flush()
match = matchline(line) match = matchline(line)
if match is None: if match is None:
if not quiet: match = expensive_matchline(line)
print >> sys.stderr, 'Malformed line at %s:%i: %r' % ( if match is None:
filename, lineno, line) if not quiet:
malformed_lines += 1 print >> sys.stderr, 'Malformed line at %s:%i: %r' % (
continue filename, lineno, line)
malformed_lines += 1
continue
if match.group('agent') in skip_user_agent: if match.group('agent') in skip_user_agent:
skipped_user_agent += 1 skipped_user_agent += 1
continue continue
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment