Encoding fixes for WSGI.

See the detailed changelog entry. This brings correctness, debugging, and performance benefits. It further matches the already documented and specified restrictions on application-provided header and status line values.

Encoding fixes for WSGI.
See the detailed changelog entry. This brings correctness, debugging, and performance benefits. It further matches the already documented and specified restrictions on application-provided header and status line values.
e8f89f12 · Jason Madden · 1d32a6a4 · e8f89f12 · e8f89f12 · e8f89f12
Commit e8f89f12 authored Sep 15, 2015 by Jason Madden
Showing with 164 additions and 24 deletions

changelog.rst changelog.rst +16 -0

doc/whatsnew_1_1.rst doc/whatsnew_1_1.rst +3 -2

gevent/pywsgi.py gevent/pywsgi.py +87 -15

greentest/test__pywsgi.py greentest/test__pywsgi.py +58 -7

No files found.
--- a/changelog.rst
+++ b/changelog.rst
@@ -15,6 +15,22 @@
  callable, which, depending on the order of imports, could be broken
  after the addition of the ``gevent.signal`` module. Reported in
  :issue:`648` by Sylvain Zimmer.
+- ``gevent.pywsgi.WSGIServer`` does a better job detecting and
+  reporting potential encoding errors for headers and the status line
+  during ``start_response`` as recommended by `WSGI specification`_.
+  In addition, under Python 2, unnecessary encodings and decodings
+  (often a trip through the ASCII encoding) are avoided for conforming
+  applications. This is an enhancement of an already documented and
+  partially enforced constraint: beginning in 1.1a1, under Python 2,
+  ``u'abc'`` would typically previously have been allowed, but
+  ``u'\u1f4a3'`` would not; now, neither will be allowed, more closely
+  matching the specification, improving debugability and performance
+  and allowing for better error handling both by the application and
+  by gevent (previously, certain encoding errors could result in
+  gevent writing invalid/malformed HTTP responses). Reported by Greg
+  Higgins and Carlos Sanchez.
+
+.. _WSGI specification: https://www.python.org/dev/peps/pep-3333/#the-start-response-callable

 1.1b4 (Sep 4, 2015)
 ===================

--- a/doc/whatsnew_1_1.rst
+++ b/doc/whatsnew_1_1.rst
@@ -184,8 +184,9 @@ reduce the cases of undocumented or non-standard behaviour.

 .. _does not use a reference-counted GC: http://doc.pypy.org/en/latest/cpython_differences.html#differences-related-to-garbage-collection-strategies

- :class:`gevent.pywsgi.WSGIServer` ensures that headers set by the
-  application can be encoded in the ISO-8859-1 charset.
+- :class:`gevent.pywsgi.WSGIServer` ensures that headers and the
+  status line set by the application can be encoded in the ISO-8859-1
+  (Latin-1) charset and are of the *native string type*.

  Under gevent 1.0, non-``bytes`` headers (that is, ``unicode`` since
  gevent 1.0 only ran on Python 2) were encoded according to the

--- a/gevent/pywsgi.py
+++ b/gevent/pywsgi.py
@@ -63,8 +63,14 @@ _CONTINUE_RESPONSE = b"HTTP/1.1 100 Continue\r\n\r\n"


 def format_date_time(timestamp):
+    # Return a byte-string of the date and time in HTTP format
+    # .. versionchanged:: 1.1b5
+    #  Return a byte string, not a native string
    year, month, day, hh, mm, ss, wd, _y, _z = time.gmtime(timestamp)
-    return "%s, %02d %3s %4d %02d:%02d:%02d GMT" % (_WEEKDAYNAME[wd], day, _MONTHNAME[month], year, hh, mm, ss)
+    value = "%s, %02d %3s %4d %02d:%02d:%02d GMT" % (_WEEKDAYNAME[wd], day, _MONTHNAME[month], year, hh, mm, ss)
+    if PY3:
+        value = value.encode("latin-1")
+    return value


 class _InvalidClientInput(IOError):
@@ -325,6 +331,11 @@ class WSGIHandler(object):
        else:
            self.rfile = rfile

+        # Instance variables added later:
+        # self.response_headers: A list of tuples of bytes: [(b'Header', b'value')]
+        # self.status: The bytes of the status line: b'200 ok'
+        # self.code: An integer giving the HTTP status: 200
+
    def handle(self):
        """
        The main request handling method, called by the server.
@@ -509,17 +520,21 @@ class WSGIHandler(object):

    def finalize_headers(self):
        if self.provided_date is None:
-            self.response_headers.append(('Date', format_date_time(time.time())))
+            self.response_headers.append((b'Date', format_date_time(time.time())))

        if self.code not in (304, 204):
            # the reply will include message-body; make sure we have either Content-Length or chunked
            if self.provided_content_length is None:
                if hasattr(self.result, '__len__'):
-                    self.response_headers.append(('Content-Length', str(sum(len(chunk) for chunk in self.result))))
+                    total_len = sum(len(chunk) for chunk in self.result)
+                    total_len_str = str(total_len)
+                    if PY3:
+                        total_len_str = total_len_str.encode("latin-1")
+                    self.response_headers.append((b'Content-Length', total_len_str))
                else:
                    if self.request_version != 'HTTP/1.0':
                        self.response_use_chunked = True
-                        self.response_headers.append(('Transfer-Encoding', 'chunked'))
+                        self.response_headers.append((b'Transfer-Encoding', b'chunked'))

    def _sendall(self, data):
        try:
@@ -555,9 +570,15 @@ class WSGIHandler(object):
        self.headers_sent = True
        self.finalize_headers()

-        towrite.extend(('HTTP/1.1 %s\r\n' % self.status).encode('latin-1'))
-        for header in self.response_headers:
-            towrite.extend(('%s: %s\r\n' % header).encode('latin-1'))
+        # self.response_headers and self.status are already in latin-1, as encoded by self.start_response
+        towrite.extend(b'HTTP/1.1 ')
+        towrite.extend(self.status)
+        towrite.extend(b'\r\n')
+        for header, value in self.response_headers:
+            towrite.extend(header)
+            towrite.extend(b': ')
+            towrite.extend(value)
+            towrite.extend(b"\r\n")

        towrite.extend(b'\r\n')
        if data:
@@ -570,10 +591,11 @@ class WSGIHandler(object):
                try:
                    towrite.extend(data)
                except TypeError:
-                    raise TypeError("Not an bytestring", data)
+                    raise TypeError("Not a bytestring", data)
        self._sendall(towrite)

    def start_response(self, status, headers, exc_info=None):
+        # .. versionchanged:: 1.1b5 handle header/status encoding here
        if exc_info:
            try:
                if self.headers_sent:
@@ -582,9 +604,54 @@ class WSGIHandler(object):
            finally:
                # Avoid dangling circular ref
                exc_info = None
-        self.code = int(status.split(' ', 1)[0])
-        self.status = status
-        self.response_headers = headers
+
+        # Pep 3333, "The start_response callable":
+        # https://www.python.org/dev/peps/pep-3333/#the-start-response-callable
+        # "Servers should check for errors in the headers at the time
+        # start_response is called, so that an error can be raised
+        # while the application is still running." Here, we check the encoding.
+        # This aids debuging: headers especially are generated programatically
+        # and an encoding error in a loop or list comprehension yields an opaque
+        # UnicodeError without any clue which header was wrong.
+        # Note that this results in copying the header list at this point, not modifying it,
+        # although we are allowed to do so if needed. This slightly increases memory usage.
+        response_headers = []
+        header = None
+        value = None
+        try:
+            for header, value in headers:
+                if not isinstance(header, str):
+                    raise UnicodeError("The header must be a native string", header, value)
+                if not isinstance(value, str):
+                    raise UnicodeError("The value must be a native string", header, value)
+                # Either we're on Python 2, in which case bytes is correct, or
+                # we're on Python 3 and the user screwed up (because it should be a native
+                # string). In either case, make sure that this is latin-1 compatible. Under
+                # Python 2, bytes.encode() will take a round-trip through the system encoding,
+                # which may be ascii, which is not really what we want. However, the latin-1 encoding
+                # can encode everything except control characters and the block from 0x7F to 0x9F, so
+                # explicitly round-tripping bytes through the encoding is unlikely to be of much
+                # benefit, so we go for speed (the WSGI spec specifically calls out allowing the range
+                # from 0x00 to 0xFF, although the HTTP spec forbids the control characters).
+                # Note: Some Python 2 implementations, like Jython, may allow non-octet (above 255) values
+                # in their str implementation; this is mentioned in the WSGI spec, but we don't
+                # run on any platform like that so we can assume that a str value is pure bytes.
+                response_headers.append((header if not PY3 else header.encode("latin-1"),
+                                         value if not PY3 else value.encode("latin-1")))
+        except UnicodeEncodeError:
+            # If we get here, we're guaranteed to have a header and value
+            raise UnicodeError("Non-latin1 header", header, value)
+
+        # Same as above
+        if not isinstance(status, str):
+            raise UnicodeError("The status string must be a native string")
+        # don't assign to anything until the validation is complete, including parsing the
+        # code
+        code = int(status.split(' ', 1)[0])
+
+        self.status = status if not PY3 else status.encode("latin-1")
+        self.response_headers = response_headers
+        self.code = code

        provided_connection = None
        self.provided_date = None
@@ -600,7 +667,7 @@ class WSGIHandler(object):
                self.provided_content_length = value

        if self.request_version == 'HTTP/1.0' and provided_connection is None:
-            headers.append(('Connection', 'close'))
+            response_headers.append((b'Connection', b'close'))
            self.close_connection = True
        elif provided_connection == 'close':
            self.close_connection = True
@@ -700,9 +767,14 @@ class WSGIHandler(object):
            self.start_response(status, headers[:])
            self.write(body)

-    def handle_error(self, type, value, tb):
-        if not issubclass(type, GreenletExit):
-            self.server.loop.handle_error(self.environ, type, value, tb)
+    def _log_error(self, t, v, tb):
+        # TODO: Shouldn't we dump this to wsgi.errors? If we did that now, it would
+        # wind up getting logged twice
+        if not issubclass(t, GreenletExit):
+            self.server.loop.handle_error(self.environ, t, v, tb)
+
+    def handle_error(self, t, v, tb):
+        self._log_error(t, v, tb)
        del tb
        self._send_error_response_if_possible(500)


--- a/greentest/test__pywsgi.py
+++ b/greentest/test__pywsgi.py
@@ -21,7 +21,11 @@ from __future__ import print_function
 from gevent import monkey
 monkey.patch_all(thread=False)

-import cgi
+try:
+    from urllib.parse import parse_qs
+except ImportError:
+    # Python 2
+    from cgi import parse_qs
 import os
 import sys
 try:
@@ -118,13 +122,13 @@ class Response(object):
        self.chunks = False
        try:
            version, code, self.reason = status_line[:-2].split(' ', 2)
+            self.code = int(code)
+            HTTP, self.version = version.split('/')
+            assert HTTP == 'HTTP', repr(HTTP)
+            assert self.version in ('1.0', '1.1'), repr(self.version)
        except Exception:
            print('Error: %r' % status_line)
            raise
-        self.code = int(code)
-        HTTP, self.version = version.split('/')
-        assert HTTP == 'HTTP', repr(HTTP)
-        assert self.version in ('1.0', '1.1'), repr(self.version)

    def __iter__(self):
        yield self.status_line
@@ -235,9 +239,13 @@ class TestCase(greentest.TestCase):
    validator = staticmethod(validator)
    application = None

-    def init_server(self, application):
+    def init_logger(self):
        import logging
        logger = logging.getLogger('gevent.pywsgi')
+        return logger
+
+    def init_server(self, application):
+        logger = self.logger = self.init_logger()
        self.server = pywsgi.WSGIServer(('127.0.0.1', 0), application,
                                        log=logger, error_log=logger)

@@ -480,7 +488,7 @@ class TestGetArg(TestCase):
        body = env['wsgi.input'].read(3)
        if PY3:
            body = body.decode('ascii')
-        a = cgi.parse_qs(body).get('a', [1])[0]
+        a = parse_qs(body).get('a', [1])[0]
        start_response('200 OK', [('Content-Type', 'text/plain')])
        return [('a is %s, body is %s' % (a, body)).encode('ascii')]

@@ -738,6 +746,49 @@ Connection: close
        read_http(sock.makefile(), reason='PASSED', chunks=False, body='', content_length=0)


+class TestNonLatin1HeaderFromApplication(TestCase):
+    error_fatal = False # Allow sending the exception response, don't kill the greenlet
+
+    validator = None # Don't validate the application, it's deliberately bad
+    header = b'\xe1\xbd\x8a3' # bomb in utf-8 bytes
+    should_error = PY3 # non-native string under Py3
+
+    def init_server(self, app):
+        TestCase.init_server(self, app)
+        self.errors = list()
+
+    def application(self, environ, start_response):
+        # We return a header that cannot be encoded in latin-1
+        try:
+            start_response("200 PASSED",
+                           [('Content-Type', 'text/plain'),
+                            ('Custom-Header', self.header)])
+        except:
+            self.errors.append(sys.exc_info()[:2])
+            raise
+        return []
+
+    def test(self):
+        sock = self.connect()
+        sock.sendall(b'''GET / HTTP/1.1\r\n\r\n''')
+        if self.should_error:
+            read_http(sock.makefile(), code=500, reason='Internal Server Error')
+            self.assertEqual(len(self.errors), 1)
+            t, v = self.errors[0]
+            self.assertTrue(isinstance(v, UnicodeError))
+        else:
+            read_http(sock.makefile(), code=200, reason='PASSED')
+            self.assertEqual(len(self.errors), 0)
+
+
+class TestNonLatin1UnicodeHeaderFromApplication(TestNonLatin1HeaderFromApplication):
+    # Flip-flop of the superclass: Python 3 native string, Python 2 unicode object
+    header = u"\u1f4a3" # bomb in unicode
+    # Error both on py3 and py2. On py2, non-native string. On py3, native string
+    # that cannot be encoded to latin-1
+    should_error = True
+
+
 class TestInputReadline(TestCase):
    # this test relies on the fact that readline() returns '' after it reached EOF
    # this behaviour is not mandated by WSGI spec, it's just happens that gevent.wsgi behaves like that