support surrogates in unicode string literals in Py3.3

ac90a80a · Stefan Behnel · cd98dbdb · ac90a80a · ac90a80a · ac90a80a
Commit ac90a80a authored Mar 15, 2013 by Stefan Behnel
Showing with 58 additions and 25 deletions

Cython/Compiler/ExprNodes.py +23 -24

Cython/Compiler/StringEncoding.py +21 -1

tests/run/unicodeliterals.pyx +14 -0

No files found.
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -1187,7 +1187,7 @@ class UnicodeNode(ConstNode):
        self.constant_result = self.value

    def as_sliced_node(self, start, stop, step=None):
-        if _string_contains_surrogates(self.value[:stop]):
+        if StringEncoding.string_contains_surrogates(self.value[:stop]):
            # this is unsafe as it may give different results in different runtimes
            return None
        value = StringEncoding.EncodedString(self.value[start:stop:step])
@@ -1236,11 +1236,30 @@ class UnicodeNode(ConstNode):
        return BoolNode(self.pos, value=bool_value, constant_result=bool_value)

    def contains_surrogates(self):
-        return _string_contains_surrogates(self.value)
+        return StringEncoding.string_contains_surrogates(self.value)

    def generate_evaluation_code(self, code):
        if self.type.is_pyobject:
-            self.result_code = code.get_py_string_const(self.value)
+            if self.contains_surrogates():
+                # surrogates are not really portable and cannot be
+                # decoded by the UTF-8 codec in Py3.3
+                self.result_code = code.get_py_const(py_object_type, 'ustring_')
+                data_cname = code.get_pyunicode_ptr_const(self.value)
+                code = code.get_cached_constants_writer()
+                code.mark_pos(self.pos)
+                code.putln(
+                    "%s = PyUnicode_FromUnicode(%s, (sizeof(%s) / sizeof(Py_UNICODE))-1); %s" % (
+                        self.result_code,
+                        data_cname,
+                        data_cname,
+                        code.error_goto_if_null(self.result_code, self.pos)))
+                code.putln("#if CYTHON_PEP393_ENABLED")
+                code.putln(
+                    code.error_goto_if_neg(
+                        "PyUnicode_READY(%s)" % self.result_code, self.pos))
+                code.putln("#endif")
+            else:
+                self.result_code = code.get_py_string_const(self.value)
        else:
            self.result_code = code.get_pyunicode_ptr_const(self.value)

@@ -1271,7 +1290,7 @@ class StringNode(PyConstNode):
        value = type(self.value)(self.value[start:stop:step])
        value.encoding = self.value.encoding
        if self.unicode_value is not None:
-            if _string_contains_surrogates(self.unicode_value[:stop]):
+            if StringEncoding.string_contains_surrogates(self.unicode_value[:stop]):
                # this is unsafe as it may give different results in different runtimes
                return None
            unicode_value = StringEncoding.EncodedString(
@@ -1316,26 +1335,6 @@ class IdentifierStringNode(StringNode):
    is_identifier = True


-def _string_contains_surrogates(ustring):
-    """
-    Check if the unicode string contains surrogate code points
-    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
-    Unicode, i.e. characters that would be spelled as two
-    separate code units on a narrow platform.
-    """
-    for c in map(ord, ustring):
-        if c > 65535: # can only happen on wide platforms
-            return True
-            # We only look for the first code unit (D800-DBFF) of a
-        # surrogate pair - if we find one, the other one
-        # (DC00-DFFF) is likely there, too.  If we don't find it,
-        # any second code unit cannot make for a surrogate pair by
-        # itself.
-        if 0xD800 <= c <= 0xDBFF:
-            return True
-    return False
-
-
 class ImagNode(AtomicExprNode):
    #  Imaginary number literal
    #

--- a/Cython/Compiler/StringEncoding.py
+++ b/Cython/Compiler/StringEncoding.py
@@ -126,9 +126,28 @@ class EncodedString(_unicode):
        assert self.encoding is None
        return self.encode("UTF-8")

+    @property
    def is_unicode(self):
        return self.encoding is None
-    is_unicode = property(is_unicode)
+
+    def contains_surrogates(self):
+        return string_contains_surrogates(self)
+
+
+def string_contains_surrogates(ustring):
+    """
+    Check if the unicode string contains surrogate code points
+    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
+    Unicode, i.e. characters that would be spelled as two
+    separate code units on a narrow platform.
+    """
+    for c in map(ord, ustring):
+        if c > 65535:  # can only happen on wide platforms
+            return True
+        if 0xD800 <= c <= 0xDFFF:
+            return True
+    return False
+

 class BytesLiteral(_bytes):
    # bytes subclass that is compatible with EncodedString
@@ -155,6 +174,7 @@ class BytesLiteral(_bytes):

    is_unicode = False

+
 char_from_escape_sequence = {
    r'\a' : u'\a',
    r'\b' : u'\b',

--- a/tests/run/unicodeliterals.pyx
+++ b/tests/run/unicodeliterals.pyx
@@ -17,6 +17,10 @@ __doc__ = br"""
    u'\x03g\xf8\uf8d2S\xf8k ik'
    >>> f
    u'\xf8'
+    >>> g
+    u'\udc00'
+    >>> h
+    u'\ud800'
    >>> add
    u'S\xf8k ik\xfc\xd6\xe4abc'
    >>> null
@@ -36,6 +40,10 @@ __doc__ = br"""
    10
    >>> len(f)
    1
+    >>> len(g)
+    1
+    >>> len(h)
+    1
    >>> len(add)
    12
    >>> len(null)
@@ -63,6 +71,10 @@ __doc__ = br"""
    True
    >>> f == u'\\xf8' # unescaped by Python
    True
+    >>> g == u'\\udc00' # unescaped by Python (required by doctest)
+    True
+    >>> h == u'\\ud800' # unescaped by Python (required by doctest)
+    True
    >>> k == u'\\N{SNOWMAN}' == u'\\u2603'
    True
    >>> add == u'Søk ik' + u'üÖä' + 'abc'
@@ -95,6 +107,8 @@ c = u'Søk ik'
 d = u'üÖä'
 e = u'\x03\x67\xf8\uf8d2Søk ik'
 f = u'\xf8'
+g = u'\udc00'   # lone trail surrogate
+h = u'\ud800'   # lone lead surrogate
 k = u'\N{SNOWMAN}'

 add = u'Søk ik' + u'üÖä' + u'abc'