Commit 4a9737a9 authored by Vincent Pelletier

Toward Python3: unicode literals

Costs 30% performance on pypy.
parent b068f82d
@@ -26,7 +26,8 @@
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 #
 ##############################################################################
-from __future__ import print_function, division, absolute_import
+from __future__ import print_function, division, absolute_import, \
+  unicode_literals
 from cgi import escape
 from collections import defaultdict, Counter
 from datetime import datetime, timedelta, date
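On Python 2, the new __future__ import is what drives every other change in this commit: every bare string literal in the module becomes a unicode object. A minimal standalone illustration (not part of the patch):

  from __future__ import unicode_literals
  s = 'abc'
  print(type(s))  # <type 'unicode'> on Python 2, <class 'str'> on Python 3
  b = b'abc'      # an explicit bytes literal keeps the byte type
  print(type(b))  # <type 'str'> on Python 2, <class 'bytes'> on Python 3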
@@ -37,6 +38,7 @@ import argparse
 import bz2
 import calendar
 import codecs
+import functools
 import gzip
 import httplib
 import itertools
@@ -54,17 +56,57 @@ import traceback
 def getResource(name, encoding='utf-8'):
   return pkgutil.get_data(__name__, name).decode(encoding)
+
+def _wrapOpen(func):
+  @functools.wraps(func)
+  def wrapper(*args, **kw):
+    encoding = kw.pop('encoding', None)
+    errors = kw.pop('errors', 'strict')
+    file_object = func(*args, **kw)
+    if encoding is None:
+      return file_object
+    info = codecs.lookup(encoding)
+    srw = codecs.StreamReaderWriter(
+      file_object,
+      info.streamreader,
+      info.streamwriter,
+      errors,
+    )
+    srw.encoding = encoding
+    return srw
+  return wrapper
+
+lzma = None
+gzip_open = gzip.open
+if sys.version_info >= (3, 3):
+  import lzma
+  bz2_open = bz2.open
+  _read_mode = 'rt'
+else:
+  open = codecs.open
+  gzip_open = _wrapOpen(gzip_open)
+  bz2_open = _wrapOpen(bz2.BZ2File)
+  _read_mode = 'r'
 FILE_OPENER_LIST = [
-  (gzip.open, IOError),
-  (bz2.BZ2File, IOError),
+  (gzip_open, IOError),
+  (bz2_open, IOError),
 ]
-try:
-  from backports import lzma
-except ImportError:
-  pass
+if lzma is None:
+  try:
+    from backports import lzma
+  except ImportError:
+    pass
+if lzma is not None:
+  FILE_OPENER_LIST.append((lzma.open, lzma.LZMAError))
 # XXX: what encoding ? apache doesn't document one, but requests are supposed
 # to be urlencoded, so pure ascii. Are timestamps localised ?
 INPUT_ENCODING = 'ascii'
+if sys.version_info < (3, ):
+  unquoteToHtml = lambda x: escape(unquote(x.encode('ascii')).decode('utf-8'))
 else:
-  FILE_OPENER_LIST.append((lzma.open, lzma.LZMAError))
+  unquoteToHtml = lambda x: escape(unquote(x))
 MONTH_VALUE_DICT = dict((y, x) for (x, y) in enumerate(('Jan', 'Feb', 'Mar',
   'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'), 1))
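To see what the compatibility block buys on Python 2: _wrapOpen turns a byte-oriented opener into one returning a codecs.StreamReaderWriter, so readline() yields unicode objects, much like Python 3's text mode. A usage sketch (the file name is hypothetical):

  logfile = gzip_open('access.log.gz', 'r', encoding='ascii')
  line = logfile.readline()  # a unicode object, decoded as ascii
  logfile.close()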
@@ -351,7 +393,7 @@ class GenericSiteStats(object):
         reverse=True)[:N_SLOWEST]:
       append('<tr>')
       append(data.asHTML(self.threshold))
-      append('<td class="text">%s</td></tr>' % unquoteToHtml(url, encoding))
+      append('<td class="text">%s</td></tr>' % unquoteToHtml(url))
     append('</table>')
     append('<h2>User agents</h2><table class="stats"><tr><th>hits</th>'
       '<th>user agent</th></tr>')
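The unquoteToHtml call sites lose their encoding argument because the helper now receives text and, on Python 2, handles the bytes round trip itself. An illustrative use of the Python 3 variant defined above (value invented; cgi.escape is the import this module already uses, replaced by html.escape in Python 3.8+):

  from cgi import escape
  from urllib.parse import unquote  # Python 3 location of unquote
  unquoteToHtml = lambda x: escape(unquote(x))
  print(unquoteToHtml('/a%20b?q=%3Cx%3E'))  # /a b?q=&lt;x&gt;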
@@ -413,8 +455,8 @@ class GenericSiteStats(object):
       append('<td>%s</td><td class="text">%s</td>'
         '<td class="text">%s</td>' % (
           getHitForUrl(referer_counter),
-          unquoteToHtml(url, encoding),
-          '<br/>'.join('%i: %s' % (hit, unquoteToHtml(referer, encoding))
+          unquoteToHtml(url),
+          '<br/>'.join('%i: %s' % (hit, unquoteToHtml(referer))
             for referer, hit in referer_counter.most_common(
               N_REFERRER_PER_ERROR_URL)),
         ))
@@ -931,9 +973,6 @@ period_parser = {
   ),
 }
 
-unquoteToHtml = lambda x, encoding: escape(unquote(x).decode(encoding,
-  'replace'))
-
 apdex_y_scale_dict = {
   'linear': None,
   'log': 'log100To0',
@@ -980,8 +1019,7 @@ def asHTML(out, encoding, per_site, args, default_site, period_parameter_dict,
       key=lambda x: site_caption_dict[x[0]])))
   html_site_caption_dict = {}
   for i, (site_id, _) in site_list:
-    html_site_caption_dict[site_id] = unquoteToHtml(site_caption_dict[site_id],
-      encoding)
+    html_site_caption_dict[site_id] = unquoteToHtml(site_caption_dict[site_id])
   if len(per_site) > 1:
     out.write('<h2>Index</h2><ol>')
     for i, (site_id, _) in site_list:
@@ -1084,17 +1122,6 @@ format_generator = {
   'json': (asJSON, 'ascii'),
 }
 
-# XXX: monkey-patching json module to emit strings instead of unicode objects.
-# Because strings are faster, (30% overall performance hit moving to unicode
-# objects), and only ASCII is expected (urlencoded is ASCII).
-# Subclassing JSONDecoder is not enough as object parser uses scanstring
-# directly.
-original_scanstring = json.decoder.scanstring
-def _scanstring(*args, **kw):
-  string, end = original_scanstring(*args, **kw)
-  return string.encode('ascii'), end
-json.decoder.scanstring = _scanstring
-
 def main():
   parser = ShlexArgumentParser(description='Compute Apdex out of '
     'apache-style log files', fromfile_prefix_chars='@')
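The deleted block is the byte-string fast path that the 30% figure in the commit message refers to. A sketch of the behaviour being removed (Python 2 semantics, data invented):

  import json
  state = json.loads('{"url": "/foo"}')
  # Stock json yields unicode objects on Python 2: type(state['url']) is
  # unicode. The removed patch wrapped json.decoder.scanstring so every
  # parsed string was re-encoded via .encode('ascii'), yielding str
  # instead: faster, but at odds with unicode_literals everywhere else.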
@@ -1246,7 +1273,7 @@ def main():
   if state_file_name == '-':
     state_file = sys.stdin
   else:
-    state_file = open(state_file_name)
+    state_file = open(state_file_name, encoding='ascii')
   with state_file:
     load_start = time.time()
     state = json.load(state_file)
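Passing encoding= to open() only works here because of the compatibility block near the top of the file: on Python 2 the module rebinds open to codecs.open, which accepts that keyword; on Python 3 the builtin does. A sketch (hypothetical file name):

  state_file = open('state.json', encoding='ascii')
  state = json.load(state_file)  # JSON state text is ascii either way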
@@ -1289,7 +1316,7 @@ def main():
       logfile = sys.stdin
     else:
       for opener, exc in FILE_OPENER_LIST:
-        logfile = opener(filename)
+        logfile = opener(filename, _read_mode, encoding=INPUT_ENCODING)
         try:
          logfile.readline()
        except exc:
@@ -1298,7 +1325,7 @@ def main():
          logfile.seek(0)
          break
      else:
-        logfile = open(filename)
+        logfile = open(filename, _read_mode, encoding=INPUT_ENCODING)
    lineno = 0
    for lineno, line in enumerate(logfile, 1):
      if show_progress and lineno % 5000 == 0:
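The two hunks above are the format-sniffing loop: each opener in FILE_OPENER_LIST is tried until one reads a line without raising its paired exception, and the for-else falls back to plain open. A self-contained sketch of the same technique (names invented):

  import bz2
  import gzip

  OPENER_LIST = [(gzip.open, IOError), (bz2.BZ2File, IOError)]

  def sniff_open(filename):
    for opener, exc in OPENER_LIST:
      candidate = opener(filename)
      try:
        candidate.readline()  # raises exc if filename is not in this format
      except exc:
        candidate.close()
      else:
        candidate.seek(0)  # rewind past the probe read
        return candidate
    return open(filename)  # plain-text fallback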