Commit ddb41afa authored by Vincent Pelletier

Add parsing stats.

parent 7b7271f9
......@@ -42,6 +42,7 @@ import gzip
import os
import re
import sys
import time
try:
import matplotlib
......@@ -352,6 +353,8 @@ def main():
help='Periodicity of sampling buckets. Default: %(default)r')
parser.add_argument('-q', '--quiet', action='store_true',
help='Suppress warnings about malformed lines.')
parser.add_argument('-s', '--stats', action='store_true',
help='Enable parsing stats (time taken to parse all files, ...)')
args = parser.parse_args()
line_regex = ''
try:
......@@ -388,6 +391,11 @@ def main():
file_count = len(infile_list)
per_site = {}
hit_per_day = defaultdict(int)
malformed_lines = 0
skipped_lines = 0
no_url_lines = 0
all_lines = 0
start_time = time.time()
for fileno, filename in enumerate(infile_list, 1):
print >> sys.stderr, 'Processing %s [%i/%i]' % (
filename, fileno, file_count)
......@@ -407,9 +415,11 @@ def main():
if not quiet:
print >> sys.stderr, 'Malformed line at %s:%i: %r' % (
filename, lineno, line)
malformed_lines += 1
continue
url_match = matchrequest(match.group('request'))
if url_match is None:
no_url_lines += 1
continue
url = url_match.group('url')
if url.startswith('http'):
......@@ -422,6 +432,7 @@ def main():
site = default_site
action = default_action
if action is None:
skipped_lines += 1
continue
utcdate = asDate(match.group('timestamp'))
hit_per_day[utcdate] += 1
......@@ -430,9 +441,11 @@ def main():
except KeyError:
site_data = per_site[site] = action(threshold)
site_data.accumulate(match, url_match, utcdate)
all_lines += lineno
end_parsing_time = time.time()
os.chdir(args.out)
with open('index.html', 'w') as out:
out.write('<html><head><title>Stats</title><style>th, td { border: solid 1px #000; } th { text-align: center; } td { text-align: right; } td.text { text-align: left; } table { border-collapse: collapse; } .problem { background-color: #f00; color: white; } .warning { background-color: #f80; color: white; } </style></head><body><h1>Overall</h1><h2>Hits per day</h2><table><tr><th>date</th><th>hits</th></tr>')
out.write('<html><head><title>Stats</title><style>th, td { border: solid 1px #000; } th { text-align: center; } td { text-align: right; } th.text, td.text { text-align: left; } table { border-collapse: collapse; } .problem { background-color: #f00; color: white; } .warning { background-color: #f80; color: white; } </style></head><body><h1>Overall</h1><h2>Hits per day</h2><table><tr><th>date</th><th>hits</th></tr>')
for date, hit in sorted(hit_per_day.iteritems(), key=ITEMGETTER0):
out.write('<tr><td>%s</td><td>%s</td></tr>' % (date, hit))
out.write('</table>')
......@@ -473,6 +486,22 @@ def main():
out.write('<img src="' + plot_filename + '" />')
out.write(data.asHTML())
end_stat_time = time.time()
if args.stats:
out.write('<h1>Parsing stats</h1><table>')
for caption, value in (
('Execution date', datetime.now().isoformat()),
('File count', file_count),
('Lines', all_lines),
('... malformed', malformed_lines),
('... URL-less', no_url_lines),
('... skipped', skipped_lines),
('Parsing time', '%.2fs' % (end_parsing_time - start_time)),
('Rendering time', '%.2fs' % (end_stat_time - end_parsing_time)),
):
out.write('<tr><th class="text">%s</th><td>%s</td></tr>' % (
caption, value))
out.write('</table>')
out.write('</body></html>')
if __name__ == '__main__':
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment