Commit d14743d9 authored by Vincent Pelletier

Implement automatic period selection.

parent 2c0837f3
- use some templating system instead of hardcoded html strings - use some templating system instead of hardcoded html strings
- provide some form of raw data output, not just html - provide some form of raw data output, not just html
- allow user to specify min & max dates - allow user to specify min & max dates
- automatically select period from log data ?
...@@ -66,6 +66,7 @@ N_REFERRER_PER_ERROR_URL = 5 ...@@ -66,6 +66,7 @@ N_REFERRER_PER_ERROR_URL = 5
ITEMGETTER0 = itemgetter(0) ITEMGETTER0 = itemgetter(0)
ITEMGETTER1 = itemgetter(1) ITEMGETTER1 = itemgetter(1)
APDEX_TOLERATING_COEF = 4 APDEX_TOLERATING_COEF = 4
AUTO_PERIOD_COEF = 200
def statusIsError(status): def statusIsError(status):
return status[0] > '3' return status[0] > '3'
...@@ -260,6 +261,18 @@ class GenericSiteStats(object): ...@@ -260,6 +261,18 @@ class GenericSiteStats(object):
self.url_apdex = defaultdict(partial(APDEXStats, threshold, getDuration)) self.url_apdex = defaultdict(partial(APDEXStats, threshold, getDuration))
self.apdex = defaultdict(partial(APDEXStats, threshold, getDuration)) self.apdex = defaultdict(partial(APDEXStats, threshold, getDuration))
def rescale(self, convert, getDuration):
  """
  Re-dispatch the date-keyed data of this site into new date buckets.

  convert
    Callable receiving a date key as currently stored and returning the
    date key it must be stored under from now on.
  getDuration
    Remembered on the instance, and given to every APDEXStats created for
    the new buckets.
  """
  self.getDuration = getDuration
  status_dict = self.status
  # Replacing the value of an existing key does not resize the dict, so it is
  # safe to do while iterating over it.
  for status, old_count_dict in status_dict.iteritems():
    merged_count_dict = defaultdict(int)
    # Several old dates may convert to the same new date: sum their counts.
    for old_date, count in old_count_dict.iteritems():
      merged_count_dict[convert(old_date)] += count
    status_dict[status] = merged_count_dict
  makeAPDEXStats = partial(APDEXStats, self.threshold, getDuration)
  rescaled_apdex = defaultdict(makeAPDEXStats)
  for old_date, apdex in self.apdex.iteritems():
    rescaled_apdex[convert(old_date)].accumulateFrom(apdex)
  self.apdex = rescaled_apdex
def accumulate(self, match, url_match, date): def accumulate(self, match, url_match, date):
self.apdex[date].accumulate(match) self.apdex[date].accumulate(match)
if url_match is None: if url_match is None:
...@@ -381,6 +394,20 @@ class ERP5SiteStats(GenericSiteStats): ...@@ -381,6 +394,20 @@ class ERP5SiteStats(GenericSiteStats):
defaultdict, partial(APDEXStats, threshold, getDuration)))) defaultdict, partial(APDEXStats, threshold, getDuration))))
self.no_module = defaultdict(partial(APDEXStats, threshold, getDuration)) self.no_module = defaultdict(partial(APDEXStats, threshold, getDuration))
def rescale(self, convert, getDuration):
  """
  Re-dispatch date-keyed data into new date buckets, including the
  per-module and module-less APDEX data held by this class.

  convert
    Callable receiving a date key as currently stored and returning the
    date key it must be stored under from now on.
  getDuration
    Forwarded to the parent implementation, and given to every APDEXStats
    created for the new buckets.
  """
  super(ERP5SiteStats, self).rescale(convert, getDuration)
  makeAPDEXStats = partial(APDEXStats, self.threshold, getDuration)

  def rescaled(date_dict):
    # Merge every entry of date_dict into the bucket its date converts to.
    result = defaultdict(makeAPDEXStats)
    for old_date, apdex in date_dict.iteritems():
      result[convert(old_date)].accumulateFrom(apdex)
    return result

  for document_dict in self.module.itervalues():
    # Only values of existing keys are replaced, so iterating is safe.
    for is_document, date_dict in document_dict.iteritems():
      document_dict[is_document] = rescaled(date_dict)
  self.no_module = rescaled(self.no_module)
def accumulate(self, match, url_match, date): def accumulate(self, match, url_match, date):
split = self.suffix(url_match.group('url')).split('?', 1)[0].split('/') split = self.suffix(url_match.group('url')).split('?', 1)[0].split('/')
if split and split[0].endswith('_module'): if split and split[0].endswith('_module'):
...@@ -527,6 +554,9 @@ def _weekStringAsQuarterString(timestamp): ...@@ -527,6 +554,9 @@ def _weekStringAsQuarterString(timestamp):
year, month, _ = timestamp.split('/') year, month, _ = timestamp.split('/')
return '%s/%02i' % (year, int(month) / 3 * 3 + 1) return '%s/%02i' % (year, int(month) / 3 * 3 + 1)
def _roundWeek(dt):
return dt.replace(day=dt.day / 7 * 7 + 1)
def _asDayString(timestamp): def _asDayString(timestamp):
dt, _ = timestamp.split(' ') dt, _ = timestamp.split(' ')
day, month, year = dt.split(':', 1)[0].split('/') day, month, year = dt.split(':', 1)[0].split('/')
...@@ -539,6 +569,9 @@ def _as6HourString(timestamp): ...@@ -539,6 +569,9 @@ def _as6HourString(timestamp):
return '%s/%02i/%s %02i' % (year, MONTH_VALUE_DICT[month], day, return '%s/%02i/%s %02i' % (year, MONTH_VALUE_DICT[month], day,
int(hour) / 6 * 6) int(hour) / 6 * 6)
def _round6Hour(dt):
return dt.replace(hour=dt.hour / 6 * 6)
def _hourAsWeekString(timestamp): def _hourAsWeekString(timestamp):
dt = datetime.strptime(timestamp, '%Y/%m/%d %H') dt = datetime.strptime(timestamp, '%Y/%m/%d %H')
return (dt - timedelta(dt.weekday())).date().strftime('%Y/%m/%d') return (dt - timedelta(dt.weekday())).date().strftime('%Y/%m/%d')
...@@ -559,6 +592,8 @@ def _asHourString(timestamp): ...@@ -559,6 +592,8 @@ def _asHourString(timestamp):
# datetime.datetime instance # datetime.datetime instance
# - period during which a placeholder point will be added if there is no data # - period during which a placeholder point will be added if there is no data
# point # point
# - round a datetime.datetime instance so once represented using given format
# string it is a valid graph-granularity date for period
period_parser = { period_parser = {
'year': ( 'year': (
_asMonthString, _asMonthString,
...@@ -567,6 +602,7 @@ period_parser = { ...@@ -567,6 +602,7 @@ period_parser = {
'%Y/%m', '%Y/%m',
# Longest month: 31 days # Longest month: 31 days
timedelta(31), timedelta(31),
lambda x: x,
), ),
'quarter': ( 'quarter': (
_asWeekString, _asWeekString,
...@@ -576,6 +612,7 @@ period_parser = { ...@@ -576,6 +612,7 @@ period_parser = {
'7 days', '7 days',
'%Y/%m/%d', '%Y/%m/%d',
timedelta(7), timedelta(7),
_roundWeek,
), ),
'month': ( 'month': (
_asDayString, _asDayString,
...@@ -584,6 +621,7 @@ period_parser = { ...@@ -584,6 +621,7 @@ period_parser = {
'%Y/%m/%d', '%Y/%m/%d',
# Longest day: 24 hours + 1h DST (never more ?) # Longest day: 24 hours + 1h DST (never more ?)
timedelta(seconds=3600 * 25), timedelta(seconds=3600 * 25),
lambda x: x,
), ),
'week': ( 'week': (
_as6HourString, _as6HourString,
...@@ -591,6 +629,7 @@ period_parser = { ...@@ -591,6 +629,7 @@ period_parser = {
'6 hours', '6 hours',
'%Y/%m/%d %H', '%Y/%m/%d %H',
timedelta(seconds=3600 * 6), timedelta(seconds=3600 * 6),
_round6Hour,
), ),
'day': ( 'day': (
_asHourString, _asHourString,
...@@ -599,6 +638,7 @@ period_parser = { ...@@ -599,6 +638,7 @@ period_parser = {
'%Y/%m/%d %H', '%Y/%m/%d %H',
# Longest hour: 60 * 60 seconds + 1 leap second. # Longest hour: 60 * 60 seconds + 1 leap second.
timedelta(seconds=3601), timedelta(seconds=3601),
lambda x: x,
), ),
} }
...@@ -623,8 +663,12 @@ def main(): ...@@ -623,8 +663,12 @@ def main():
'Default: %(default).2fs') 'Default: %(default).2fs')
group.add_argument('-e', '--error-detail', action='store_true', group.add_argument('-e', '--error-detail', action='store_true',
help='Include detailed report (url & referers) for error statuses.') help='Include detailed report (url & referers) for error statuses.')
group.add_argument('-p', '--period', default='day', choices=period_parser, group.add_argument('-p', '--period', choices=period_parser,
help='Periodicity of sampling buckets. Default: %(default)r') help='Periodicity of sampling buckets. Default: (decide from data). '
'Performance note: leaving out this parameter reduces parsing '
'performance, as each period increase requires re-dispatching already '
'processed data. To mitigate this, provide earliest and latest log '
'files before all others (ex: log0 log3 log1 log2).')
group.add_argument('-s', '--stats', action='store_true', group.add_argument('-s', '--stats', action='store_true',
help='Enable parsing stats (time spent parsing input, time spent ' help='Enable parsing stats (time spent parsing input, time spent '
'generating output, ...)') 'generating output, ...)')
...@@ -690,8 +734,23 @@ def main(): ...@@ -690,8 +734,23 @@ def main():
assert not key, key assert not key, key
matchline = re.compile(line_regex).match matchline = re.compile(line_regex).match
matchrequest = REQUEST_PATTERN.match matchrequest = REQUEST_PATTERN.match
asDate, decimator, graph_period, date_format, placeholder_delta = \ if args.period is None:
period_parser[args.period] next_period_data = ((x, y[4] * AUTO_PERIOD_COEF) for (x, y) in
sorted(period_parser.iteritems(), key=lambda x: x[1][4])).next
period, to_next_period = next_period_data()
earliest_date = latest_date = None
def getNextPeriod():
# datetime is slow (compared to string operations), but not many choices
return (datetime.strptime(earliest_date, date_format) + to_next_period
).strftime(date_format)
def rescale(x):
result = round_date(datetime.strptime(x, old_date_format)).strftime(date_format)
return result
else:
to_next_period = None
period = args.period
asDate, decimator, graph_period, date_format, placeholder_delta, \
round_date = period_parser[period]
site_list = args.path site_list = args.path
default_site = args.default default_site = args.default
if default_site is None: if default_site is None:
...@@ -757,6 +816,30 @@ def main(): ...@@ -757,6 +816,30 @@ def main():
skipped_lines += 1 skipped_lines += 1
continue continue
date = asDate(match.group('timestamp')) date = asDate(match.group('timestamp'))
if to_next_period is not None:
if date > latest_date: # '' > None is True
latest_date = date
if date < earliest_date or earliest_date is None:
earliest_date = date
next_period = getNextPeriod()
if latest_date > next_period:
try:
while latest_date > next_period:
period, to_next_period = next_period_data()
next_period = getNextPeriod()
except StopIteration:
pass
print >> sys.stderr, 'Increasing period to', period, '...',
old_date_format = date_format
asDate, decimator, graph_period, date_format, placeholder_delta, \
round_date = period_parser[period]
period_increase_start = time.time()
print old_date_format, date_format
for site_data in per_site.itervalues():
site_data.rescale(rescale, getDuration)
print >> sys.stderr, 'done (%s)' % timedelta(seconds=time.time()
- period_increase_start)
date = asDate(match.group('timestamp'))
try: try:
site_data = per_site[site] site_data = per_site[site]
except KeyError: except KeyError:
...@@ -793,7 +876,7 @@ def main(): ...@@ -793,7 +876,7 @@ def main():
'<table class="stats">') '<table class="stats">')
for caption, value in ( for caption, value in (
('apdex threshold', '%.2fs' % args.apdex), ('apdex threshold', '%.2fs' % args.apdex),
('period', args.period), ('period', args.period or (period + ' (auto)')),
): ):
out.write('<tr><th class="text">%s</th><td>%s</td></tr>' % ( out.write('<tr><th class="text">%s</th><td>%s</td></tr>' % (
caption, value)) caption, value))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment