Commit e804ffc3 authored by Vincent Pelletier

Add a parameter to filter out given user agents.

parent 1c37c536
- use some templating system instead of hardcoded html strings - use some templating system instead of hardcoded html strings
- provide some form of raw data output, not just html - provide some form of raw data output, not just html
- allow user to specify min & max dates - allow user to specify min & max dates
- filter(out ?) by user agent
- automatically select period from log data ? - automatically select period from log data ?
...@@ -34,6 +34,7 @@ from operator import itemgetter ...@@ -34,6 +34,7 @@ from operator import itemgetter
from urllib import splittype, splithost from urllib import splittype, splithost
import argparse import argparse
import gzip import gzip
import itertools
import json import json
import math import math
import os import os
...@@ -477,6 +478,11 @@ def main(): ...@@ -477,6 +478,11 @@ def main():
action=AggregateSiteUrl, action=AggregateSiteUrl,
help='Absolute base url(s) to ignore.') help='Absolute base url(s) to ignore.')
group = parser.add_argument_group('filtering')
group.add_argument('--skip-user-agent', nargs='+', default=[],
action='append', help='List of user agents from which hits should be '
'ignored. Useful to exclude monitoring systems.')
args = parser.parse_args() args = parser.parse_args()
abs_file_container = getattr(args, 'js', abs_file_container) abs_file_container = getattr(args, 'js', abs_file_container)
if DURATION_US_FORMAT in args.logformat: if DURATION_US_FORMAT in args.logformat:
...@@ -528,10 +534,12 @@ def main(): ...@@ -528,10 +534,12 @@ def main():
file_count = len(infile_list) file_count = len(infile_list)
per_site = {} per_site = {}
hit_per_day = defaultdict(int) hit_per_day = defaultdict(int)
skip_user_agent = list(itertools.chain(*args.skip_user_agent))
malformed_lines = 0 malformed_lines = 0
skipped_lines = 0 skipped_lines = 0
no_url_lines = 0 no_url_lines = 0
all_lines = 0 all_lines = 0
skipped_user_agent = 0
start_time = time.time() start_time = time.time()
for fileno, filename in enumerate(infile_list, 1): for fileno, filename in enumerate(infile_list, 1):
print >> sys.stderr, 'Processing %s [%i/%i]' % ( print >> sys.stderr, 'Processing %s [%i/%i]' % (
...@@ -555,6 +563,9 @@ def main(): ...@@ -555,6 +563,9 @@ def main():
filename, lineno, line) filename, lineno, line)
malformed_lines += 1 malformed_lines += 1
continue continue
if match.group('agent') in skip_user_agent:
skipped_user_agent += 1
continue
url_match = matchrequest(match.group('request')) url_match = matchrequest(match.group('request'))
if url_match is None: if url_match is None:
no_url_lines += 1 no_url_lines += 1
...@@ -711,7 +722,8 @@ def main(): ...@@ -711,7 +722,8 @@ def main():
('Lines', all_lines), ('Lines', all_lines),
('... malformed', malformed_lines), ('... malformed', malformed_lines),
('... URL-less', no_url_lines), ('... URL-less', no_url_lines),
('... skipped', skipped_lines), ('... skipped (URL)', skipped_lines),
('... skipped (user agent)', skipped_user_agent),
('Parsing time', timedelta(seconds=parsing_time)), ('Parsing time', timedelta(seconds=parsing_time)),
('Parsing rate', '%i line/s' % (all_lines / parsing_time)), ('Parsing rate', '%i line/s' % (all_lines / parsing_time)),
('Rendering time', timedelta(seconds=( ('Rendering time', timedelta(seconds=(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment