Commit e8f89f12 authored by Jason Madden's avatar Jason Madden

Encoding fixes for WSGI.

See the detailed changelog entry. This brings correctness, debugging,
and performance benefits. It further matches the already documented
and specified restrictions on application-provided header and status
line values.
parent 1d32a6a4
...@@ -15,6 +15,22 @@ ...@@ -15,6 +15,22 @@
callable, which, depending on the order of imports, could be broken callable, which, depending on the order of imports, could be broken
after the addition of the ``gevent.signal`` module. Reported in after the addition of the ``gevent.signal`` module. Reported in
:issue:`648` by Sylvain Zimmer. :issue:`648` by Sylvain Zimmer.
- ``gevent.pywsgi.WSGIServer`` does a better job detecting and
reporting potential encoding errors for headers and the status line
during ``start_response`` as recommended by `WSGI specification`_.
In addition, under Python 2, unnecessary encodings and decodings
(often a trip through the ASCII encoding) are avoided for conforming
applications. This is an enhancement of an already documented and
partially enforced constraint: beginning in 1.1a1, under Python 2,
``u'abc'`` would typically previously have been allowed, but
``u'\u1f4a3'`` would not; now, neither will be allowed, more closely
matching the specification, improving debugability and performance
and allowing for better error handling both by the application and
by gevent (previously, certain encoding errors could result in
gevent writing invalid/malformed HTTP responses). Reported by Greg
Higgins and Carlos Sanchez.
.. _WSGI specification: https://www.python.org/dev/peps/pep-3333/#the-start-response-callable
1.1b4 (Sep 4, 2015) 1.1b4 (Sep 4, 2015)
=================== ===================
......
...@@ -184,8 +184,9 @@ reduce the cases of undocumented or non-standard behaviour. ...@@ -184,8 +184,9 @@ reduce the cases of undocumented or non-standard behaviour.
.. _does not use a reference-counted GC: http://doc.pypy.org/en/latest/cpython_differences.html#differences-related-to-garbage-collection-strategies .. _does not use a reference-counted GC: http://doc.pypy.org/en/latest/cpython_differences.html#differences-related-to-garbage-collection-strategies
- :class:`gevent.pywsgi.WSGIServer` ensures that headers set by the - :class:`gevent.pywsgi.WSGIServer` ensures that headers and the
application can be encoded in the ISO-8859-1 charset. status line set by the application can be encoded in the ISO-8859-1
(Latin-1) charset and are of the *native string type*.
Under gevent 1.0, non-``bytes`` headers (that is, ``unicode`` since Under gevent 1.0, non-``bytes`` headers (that is, ``unicode`` since
gevent 1.0 only ran on Python 2) were encoded according to the gevent 1.0 only ran on Python 2) were encoded according to the
......
...@@ -63,8 +63,14 @@ _CONTINUE_RESPONSE = b"HTTP/1.1 100 Continue\r\n\r\n" ...@@ -63,8 +63,14 @@ _CONTINUE_RESPONSE = b"HTTP/1.1 100 Continue\r\n\r\n"
def format_date_time(timestamp): def format_date_time(timestamp):
# Return a byte-string of the date and time in HTTP format
# .. versionchanged:: 1.1b5
# Return a byte string, not a native string
year, month, day, hh, mm, ss, wd, _y, _z = time.gmtime(timestamp) year, month, day, hh, mm, ss, wd, _y, _z = time.gmtime(timestamp)
return "%s, %02d %3s %4d %02d:%02d:%02d GMT" % (_WEEKDAYNAME[wd], day, _MONTHNAME[month], year, hh, mm, ss) value = "%s, %02d %3s %4d %02d:%02d:%02d GMT" % (_WEEKDAYNAME[wd], day, _MONTHNAME[month], year, hh, mm, ss)
if PY3:
value = value.encode("latin-1")
return value
class _InvalidClientInput(IOError): class _InvalidClientInput(IOError):
...@@ -325,6 +331,11 @@ class WSGIHandler(object): ...@@ -325,6 +331,11 @@ class WSGIHandler(object):
else: else:
self.rfile = rfile self.rfile = rfile
# Instance variables added later:
# self.response_headers: A list of tuples of bytes: [(b'Header', b'value')]
# self.status: The bytes of the status line: b'200 ok'
# self.code: An integer giving the HTTP status: 200
def handle(self): def handle(self):
""" """
The main request handling method, called by the server. The main request handling method, called by the server.
...@@ -509,17 +520,21 @@ class WSGIHandler(object): ...@@ -509,17 +520,21 @@ class WSGIHandler(object):
def finalize_headers(self): def finalize_headers(self):
if self.provided_date is None: if self.provided_date is None:
self.response_headers.append(('Date', format_date_time(time.time()))) self.response_headers.append((b'Date', format_date_time(time.time())))
if self.code not in (304, 204): if self.code not in (304, 204):
# the reply will include message-body; make sure we have either Content-Length or chunked # the reply will include message-body; make sure we have either Content-Length or chunked
if self.provided_content_length is None: if self.provided_content_length is None:
if hasattr(self.result, '__len__'): if hasattr(self.result, '__len__'):
self.response_headers.append(('Content-Length', str(sum(len(chunk) for chunk in self.result)))) total_len = sum(len(chunk) for chunk in self.result)
total_len_str = str(total_len)
if PY3:
total_len_str = total_len_str.encode("latin-1")
self.response_headers.append((b'Content-Length', total_len_str))
else: else:
if self.request_version != 'HTTP/1.0': if self.request_version != 'HTTP/1.0':
self.response_use_chunked = True self.response_use_chunked = True
self.response_headers.append(('Transfer-Encoding', 'chunked')) self.response_headers.append((b'Transfer-Encoding', b'chunked'))
def _sendall(self, data): def _sendall(self, data):
try: try:
...@@ -555,9 +570,15 @@ class WSGIHandler(object): ...@@ -555,9 +570,15 @@ class WSGIHandler(object):
self.headers_sent = True self.headers_sent = True
self.finalize_headers() self.finalize_headers()
towrite.extend(('HTTP/1.1 %s\r\n' % self.status).encode('latin-1')) # self.response_headers and self.status are already in latin-1, as encoded by self.start_response
for header in self.response_headers: towrite.extend(b'HTTP/1.1 ')
towrite.extend(('%s: %s\r\n' % header).encode('latin-1')) towrite.extend(self.status)
towrite.extend(b'\r\n')
for header, value in self.response_headers:
towrite.extend(header)
towrite.extend(b': ')
towrite.extend(value)
towrite.extend(b"\r\n")
towrite.extend(b'\r\n') towrite.extend(b'\r\n')
if data: if data:
...@@ -570,10 +591,11 @@ class WSGIHandler(object): ...@@ -570,10 +591,11 @@ class WSGIHandler(object):
try: try:
towrite.extend(data) towrite.extend(data)
except TypeError: except TypeError:
raise TypeError("Not an bytestring", data) raise TypeError("Not a bytestring", data)
self._sendall(towrite) self._sendall(towrite)
def start_response(self, status, headers, exc_info=None): def start_response(self, status, headers, exc_info=None):
# .. versionchanged:: 1.1b5 handle header/status encoding here
if exc_info: if exc_info:
try: try:
if self.headers_sent: if self.headers_sent:
...@@ -582,9 +604,54 @@ class WSGIHandler(object): ...@@ -582,9 +604,54 @@ class WSGIHandler(object):
finally: finally:
# Avoid dangling circular ref # Avoid dangling circular ref
exc_info = None exc_info = None
self.code = int(status.split(' ', 1)[0])
self.status = status # Pep 3333, "The start_response callable":
self.response_headers = headers # https://www.python.org/dev/peps/pep-3333/#the-start-response-callable
# "Servers should check for errors in the headers at the time
# start_response is called, so that an error can be raised
# while the application is still running." Here, we check the encoding.
# This aids debuging: headers especially are generated programatically
# and an encoding error in a loop or list comprehension yields an opaque
# UnicodeError without any clue which header was wrong.
# Note that this results in copying the header list at this point, not modifying it,
# although we are allowed to do so if needed. This slightly increases memory usage.
response_headers = []
header = None
value = None
try:
for header, value in headers:
if not isinstance(header, str):
raise UnicodeError("The header must be a native string", header, value)
if not isinstance(value, str):
raise UnicodeError("The value must be a native string", header, value)
# Either we're on Python 2, in which case bytes is correct, or
# we're on Python 3 and the user screwed up (because it should be a native
# string). In either case, make sure that this is latin-1 compatible. Under
# Python 2, bytes.encode() will take a round-trip through the system encoding,
# which may be ascii, which is not really what we want. However, the latin-1 encoding
# can encode everything except control characters and the block from 0x7F to 0x9F, so
# explicitly round-tripping bytes through the encoding is unlikely to be of much
# benefit, so we go for speed (the WSGI spec specifically calls out allowing the range
# from 0x00 to 0xFF, although the HTTP spec forbids the control characters).
# Note: Some Python 2 implementations, like Jython, may allow non-octet (above 255) values
# in their str implementation; this is mentioned in the WSGI spec, but we don't
# run on any platform like that so we can assume that a str value is pure bytes.
response_headers.append((header if not PY3 else header.encode("latin-1"),
value if not PY3 else value.encode("latin-1")))
except UnicodeEncodeError:
# If we get here, we're guaranteed to have a header and value
raise UnicodeError("Non-latin1 header", header, value)
# Same as above
if not isinstance(status, str):
raise UnicodeError("The status string must be a native string")
# don't assign to anything until the validation is complete, including parsing the
# code
code = int(status.split(' ', 1)[0])
self.status = status if not PY3 else status.encode("latin-1")
self.response_headers = response_headers
self.code = code
provided_connection = None provided_connection = None
self.provided_date = None self.provided_date = None
...@@ -600,7 +667,7 @@ class WSGIHandler(object): ...@@ -600,7 +667,7 @@ class WSGIHandler(object):
self.provided_content_length = value self.provided_content_length = value
if self.request_version == 'HTTP/1.0' and provided_connection is None: if self.request_version == 'HTTP/1.0' and provided_connection is None:
headers.append(('Connection', 'close')) response_headers.append((b'Connection', b'close'))
self.close_connection = True self.close_connection = True
elif provided_connection == 'close': elif provided_connection == 'close':
self.close_connection = True self.close_connection = True
...@@ -700,9 +767,14 @@ class WSGIHandler(object): ...@@ -700,9 +767,14 @@ class WSGIHandler(object):
self.start_response(status, headers[:]) self.start_response(status, headers[:])
self.write(body) self.write(body)
def handle_error(self, type, value, tb): def _log_error(self, t, v, tb):
if not issubclass(type, GreenletExit): # TODO: Shouldn't we dump this to wsgi.errors? If we did that now, it would
self.server.loop.handle_error(self.environ, type, value, tb) # wind up getting logged twice
if not issubclass(t, GreenletExit):
self.server.loop.handle_error(self.environ, t, v, tb)
def handle_error(self, t, v, tb):
self._log_error(t, v, tb)
del tb del tb
self._send_error_response_if_possible(500) self._send_error_response_if_possible(500)
......
...@@ -21,7 +21,11 @@ from __future__ import print_function ...@@ -21,7 +21,11 @@ from __future__ import print_function
from gevent import monkey from gevent import monkey
monkey.patch_all(thread=False) monkey.patch_all(thread=False)
import cgi try:
from urllib.parse import parse_qs
except ImportError:
# Python 2
from cgi import parse_qs
import os import os
import sys import sys
try: try:
...@@ -118,13 +122,13 @@ class Response(object): ...@@ -118,13 +122,13 @@ class Response(object):
self.chunks = False self.chunks = False
try: try:
version, code, self.reason = status_line[:-2].split(' ', 2) version, code, self.reason = status_line[:-2].split(' ', 2)
self.code = int(code)
HTTP, self.version = version.split('/')
assert HTTP == 'HTTP', repr(HTTP)
assert self.version in ('1.0', '1.1'), repr(self.version)
except Exception: except Exception:
print('Error: %r' % status_line) print('Error: %r' % status_line)
raise raise
self.code = int(code)
HTTP, self.version = version.split('/')
assert HTTP == 'HTTP', repr(HTTP)
assert self.version in ('1.0', '1.1'), repr(self.version)
def __iter__(self): def __iter__(self):
yield self.status_line yield self.status_line
...@@ -235,9 +239,13 @@ class TestCase(greentest.TestCase): ...@@ -235,9 +239,13 @@ class TestCase(greentest.TestCase):
validator = staticmethod(validator) validator = staticmethod(validator)
application = None application = None
def init_server(self, application): def init_logger(self):
import logging import logging
logger = logging.getLogger('gevent.pywsgi') logger = logging.getLogger('gevent.pywsgi')
return logger
def init_server(self, application):
logger = self.logger = self.init_logger()
self.server = pywsgi.WSGIServer(('127.0.0.1', 0), application, self.server = pywsgi.WSGIServer(('127.0.0.1', 0), application,
log=logger, error_log=logger) log=logger, error_log=logger)
...@@ -480,7 +488,7 @@ class TestGetArg(TestCase): ...@@ -480,7 +488,7 @@ class TestGetArg(TestCase):
body = env['wsgi.input'].read(3) body = env['wsgi.input'].read(3)
if PY3: if PY3:
body = body.decode('ascii') body = body.decode('ascii')
a = cgi.parse_qs(body).get('a', [1])[0] a = parse_qs(body).get('a', [1])[0]
start_response('200 OK', [('Content-Type', 'text/plain')]) start_response('200 OK', [('Content-Type', 'text/plain')])
return [('a is %s, body is %s' % (a, body)).encode('ascii')] return [('a is %s, body is %s' % (a, body)).encode('ascii')]
...@@ -738,6 +746,49 @@ Connection: close ...@@ -738,6 +746,49 @@ Connection: close
read_http(sock.makefile(), reason='PASSED', chunks=False, body='', content_length=0) read_http(sock.makefile(), reason='PASSED', chunks=False, body='', content_length=0)
class TestNonLatin1HeaderFromApplication(TestCase):
error_fatal = False # Allow sending the exception response, don't kill the greenlet
validator = None # Don't validate the application, it's deliberately bad
header = b'\xe1\xbd\x8a3' # bomb in utf-8 bytes
should_error = PY3 # non-native string under Py3
def init_server(self, app):
TestCase.init_server(self, app)
self.errors = list()
def application(self, environ, start_response):
# We return a header that cannot be encoded in latin-1
try:
start_response("200 PASSED",
[('Content-Type', 'text/plain'),
('Custom-Header', self.header)])
except:
self.errors.append(sys.exc_info()[:2])
raise
return []
def test(self):
sock = self.connect()
sock.sendall(b'''GET / HTTP/1.1\r\n\r\n''')
if self.should_error:
read_http(sock.makefile(), code=500, reason='Internal Server Error')
self.assertEqual(len(self.errors), 1)
t, v = self.errors[0]
self.assertTrue(isinstance(v, UnicodeError))
else:
read_http(sock.makefile(), code=200, reason='PASSED')
self.assertEqual(len(self.errors), 0)
class TestNonLatin1UnicodeHeaderFromApplication(TestNonLatin1HeaderFromApplication):
# Flip-flop of the superclass: Python 3 native string, Python 2 unicode object
header = u"\u1f4a3" # bomb in unicode
# Error both on py3 and py2. On py2, non-native string. On py3, native string
# that cannot be encoded to latin-1
should_error = True
class TestInputReadline(TestCase): class TestInputReadline(TestCase):
# this test relies on the fact that readline() returns '' after it reached EOF # this test relies on the fact that readline() returns '' after it reached EOF
# this behaviour is not mandated by WSGI spec, it's just happens that gevent.wsgi behaves like that # this behaviour is not mandated by WSGI spec, it's just happens that gevent.wsgi behaves like that
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment