Commit e804ffc3 authored by Vincent Pelletier

Add a parameter to filter out given user agents.

parent 1c37c536
- use some templating system instead of hardcoded html strings
- provide some form of raw data output, not just html
- allow user to specify min & max dates
- filter(out ?) by user agent
- automatically select period from log data ?
......@@ -34,6 +34,7 @@ from operator import itemgetter
from urllib import splittype, splithost
import argparse
import gzip
import itertools
import json
import math
import os
......@@ -477,6 +478,11 @@ def main():
action=AggregateSiteUrl,
help='Absolute base url(s) to ignore.')
group = parser.add_argument_group('filtering')
group.add_argument('--skip-user-agent', nargs='+', default=[],
action='append', help='List of user agents from which hits should be '
'ignored. Useful to exclude monitoring systems.')
args = parser.parse_args()
abs_file_container = getattr(args, 'js', abs_file_container)
if DURATION_US_FORMAT in args.logformat:
......@@ -528,10 +534,12 @@ def main():
file_count = len(infile_list)
per_site = {}
hit_per_day = defaultdict(int)
skip_user_agent = list(itertools.chain(*args.skip_user_agent))
malformed_lines = 0
skipped_lines = 0
no_url_lines = 0
all_lines = 0
skipped_user_agent = 0
start_time = time.time()
for fileno, filename in enumerate(infile_list, 1):
print >> sys.stderr, 'Processing %s [%i/%i]' % (
......@@ -555,6 +563,9 @@ def main():
filename, lineno, line)
malformed_lines += 1
continue
if match.group('agent') in skip_user_agent:
skipped_user_agent += 1
continue
url_match = matchrequest(match.group('request'))
if url_match is None:
no_url_lines += 1
......@@ -711,7 +722,8 @@ def main():
('Lines', all_lines),
('... malformed', malformed_lines),
('... URL-less', no_url_lines),
('... skipped', skipped_lines),
('... skipped (URL)', skipped_lines),
('... skipped (user agent)', skipped_user_agent),
('Parsing time', timedelta(seconds=parsing_time)),
('Parsing rate', '%i line/s' % (all_lines / parsing_time)),
('Rendering time', timedelta(seconds=(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment