Commit a1d86d67 authored by Vincent Pelletier

Improve best-case line parsing performance.

Supporting escaped quotes in strings made line matching slow, so create
two regexes: a cheap & brittle one, and an expensive & comprehensive one.
+20% parsing speed!
parent 701f0b2b
......@@ -634,17 +634,24 @@ logformat_dict = {
'%l': r'(?P<ident>[^ ]*)',
'%u': r'(?P<user>[^ ]*)',
'%t': r'\[(?P<timestamp>[^\]]*)\]',
'%r': r'(?P<request>(\\.|[^\\"])*)', # XXX: expected to be enclosed in ". See also REQUEST_PATTERN
'%r': r'(?P<request>[^"]*)', # XXX: expected to be enclosed in ". See also REQUEST_PATTERN
'%>s': r'(?P<status>[0-9]*?)',
'%O': r'(?P<size>[0-9-]*?)',
'%{Referer}i': r'(?P<referer>(\\.|[^\\"])*)', # XXX: expected to be enclosed in "
'%{User-Agent}i': r'(?P<agent>(\\.|[^\\"])*)', # XXX: expected to be enclosed in "
'%{Referer}i': r'(?P<referer>[^"]*)', # XXX: expected to be enclosed in "
'%{User-Agent}i': r'(?P<agent>[^"]*)', # XXX: expected to be enclosed in "
DURATION_US_FORMAT: r'(?P<duration>[0-9]*)',
DURATION_S_FORMAT: r'(?P<duration_s>[0-9]*)',
'%%': r'%',
# TODO: add more formats
}
# Slower but more tolerant regex fragments for the quoted log fields.
# Each value accepts backslash-escaped characters (including \") inside the
# field, where a plain [^"]* fragment would stop at the first quote.
# Keys are log-format directives; values are named-group regex fragments.
expensive_logformat_dict = {
    # Request line.
    '%r': r'(?P<request>(\\.|[^\\"])*)',
    # Referer request header.
    '%{Referer}i': r'(?P<referer>(\\.|[^\\"])*)',
    # User-Agent request header.
    '%{User-Agent}i': r'(?P<agent>(\\.|[^\\"])*)',
}
# Splits a request line into its method, its URL and an optional protocol
# (whatever follows the second space, if there is one).
REQUEST_PATTERN = re.compile(
    r'(?P<method>[^ ]*) (?P<url>[^ ]*)'
    r'( (?P<protocol>.*))?'
)
......@@ -962,11 +969,12 @@ def main():
'cannot be computed.'
sys.exit(1)
line_regex = ''
expensive_line_regex = ''
try:
n = iter(args.logformat).next
while True:
key = None
char = n()
expensive_char = char = n()
if char == '%':
fmt = n()
key = char + fmt
......@@ -978,10 +986,13 @@ def main():
elif fmt == '>':
key += n()
char = logformat_dict[key]
expensive_char = expensive_logformat_dict.get(key, char)
line_regex += char
expensive_line_regex += expensive_char
except StopIteration:
assert not key, key
matchline = re.compile(line_regex).match
expensive_matchline = re.compile(expensive_line_regex).match
matchrequest = REQUEST_PATTERN.match
if args.period is None:
next_period_data = ((x, y[4] * AUTO_PERIOD_COEF) for (x, y) in
......@@ -1069,11 +1080,13 @@ def main():
sys.stderr.flush()
match = matchline(line)
if match is None:
if not quiet:
print >> sys.stderr, 'Malformed line at %s:%i: %r' % (
filename, lineno, line)
malformed_lines += 1
continue
match = expensive_matchline(line)
if match is None:
if not quiet:
print >> sys.stderr, 'Malformed line at %s:%i: %r' % (
filename, lineno, line)
malformed_lines += 1
continue
if match.group('agent') in skip_user_agent:
skipped_user_agent += 1
continue
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment