Commit 7540f7e9 authored by scoder's avatar scoder Committed by Stefan Behnel

Really only use PyUnicode_FromUnicode() when needed (GH-3697)

* Really only use PyUnicode_FromUnicode() for strings that contain lone surrogates, not for normal non-BMP strings and not for surrogate pairs on 16bit Unicode platforms.

See https://github.com/cython/cython/issues/3678

* Extend buildenv test to debug a MacOS problem.
* Add a test for surrogate pairs in Unicode strings.
* Limit PyUnicode_FromUnicode() usage to strings containing lone surrogates.
* Accept ambiguity of surrogate pairs in Unicode string literals when generated on 16bit Py2 systems.
parent b145f4f9
...@@ -1623,8 +1623,13 @@ class UnicodeNode(ConstNode): ...@@ -1623,8 +1623,13 @@ class UnicodeNode(ConstNode):
def generate_evaluation_code(self, code): def generate_evaluation_code(self, code):
if self.type.is_pyobject: if self.type.is_pyobject:
if self.contains_surrogates(): # FIXME: this should go away entirely!
# surrogates are not really portable and cannot be # Since string_contains_lone_surrogates() returns False for surrogate pairs in Py2/UCS2,
# Py2 can generate different code from Py3 here. Let's hope we get away with claiming that
# the processing of surrogate pairs in code was always ambiguous and led to different results
# on 16/32bit Unicode platforms.
if StringEncoding.string_contains_lone_surrogates(self.value):
# lone (unpaired) surrogates are not really portable and cannot be
# decoded by the UTF-8 codec in Py3.3 # decoded by the UTF-8 codec in Py3.3
self.result_code = code.get_py_const(py_object_type, 'ustring') self.result_code = code.get_py_const(py_object_type, 'ustring')
data_cname = code.get_pyunicode_ptr_const(self.value) data_cname = code.get_pyunicode_ptr_const(self.value)
......
...@@ -154,6 +154,34 @@ def string_contains_surrogates(ustring): ...@@ -154,6 +154,34 @@ def string_contains_surrogates(ustring):
return False return False
def string_contains_lone_surrogates(ustring):
    """
    Check if the unicode string contains lone surrogate code points
    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
    Unicode, i.e. characters that would be spelled as two
    separate code units on a narrow platform, but that do not form a pair.

    On a narrow build (``sys.maxunicode == 65535``), a lead surrogate
    (0xD800-0xDBFF) directly followed by a trail surrogate (0xDC00-0xDFFF)
    counts as a valid pair and is not reported.  On a wide build, every
    surrogate code point is reported as lone.

    Returns True if at least one lone surrogate was found, False otherwise
    (including for the empty string).
    """
    last_was_start = False
    # Only narrow builds represent non-BMP characters as surrogate pairs.
    unicode_uses_surrogate_encoding = sys.maxunicode == 65535
    for c in map(ord, ustring):
        # surrogates tend to be rare
        if c < 0xD800 or c > 0xDFFF:
            if last_was_start:
                # the preceding lead surrogate was not followed by a trail surrogate
                return True
        elif not unicode_uses_surrogate_encoding:
            # on 32bit Unicode platforms, there is never a pair
            return True
        elif c <= 0xDBFF:
            if last_was_start:
                return True  # lone start
            last_was_start = True
        else:
            if not last_was_start:
                return True  # lone end
            last_was_start = False
    # a lead surrogate at the very end of the string is also unpaired
    return last_was_start
class BytesLiteral(_bytes): class BytesLiteral(_bytes):
# bytes subclass that is compatible with EncodedString # bytes subclass that is compatible with EncodedString
encoding = None encoding = None
......
# -*- coding: utf-8 -*-
import sys
import unittest
import Cython.Compiler.StringEncoding as StringEncoding
class StringEncodingTest(unittest.TestCase):
    """
    Test the StringEncoding module.
    """
    def test_string_contains_lone_surrogates(self):
        """Check lone surrogate detection for plain, paired and unpaired input."""
        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"abc"))
        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\uABCD"))
        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\N{SNOWMAN}"))

        # This behaves differently in Py2 when freshly parsed and read from a .pyc file,
        # but it seems to be a marshalling bug in Py2, which doesn't hurt us in Cython.
        if sys.version_info[0] != 2:
            self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800\uDFFF"))

        # In Py2 with 16bit Unicode, the following is indistinguishable from the 32bit character.
        # Both halves must be unicode literals: without the u-prefix, Py2 would treat
        # "\uD800" as a plain byte string containing a backslash and no surrogate at all.
        obfuscated_surrogate_pair = (u"\uDFFF" + u"\uD800")[::-1]
        # 65535 (0xFFFF) is the value of sys.maxunicode on narrow (16bit) Unicode builds.
        if sys.version_info[0] == 2 and sys.maxunicode == 65535:
            self.assertFalse(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))
        else:
            self.assertTrue(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))

        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800"))
        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF"))
        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF\uD800"))
        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800x\uDFFF"))

    def test_string_contains_surrogates(self):
        """Check that any surrogate code point, paired or not, is detected."""
        self.assertFalse(StringEncoding.string_contains_surrogates(u"abc"))
        self.assertFalse(StringEncoding.string_contains_surrogates(u"\uABCD"))
        self.assertFalse(StringEncoding.string_contains_surrogates(u"\N{SNOWMAN}"))

        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800"))
        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uDFFF"))
        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800\uDFFF"))
        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uDFFF\uD800"))
        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800x\uDFFF"))
...@@ -532,7 +532,11 @@ static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) { ...@@ -532,7 +532,11 @@ static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) {
#define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u) #define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u)
#define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i) #define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i)
#define __Pyx_PyUnicode_WRITE(k, d, i, ch) PyUnicode_WRITE(k, d, i, ch) #define __Pyx_PyUnicode_WRITE(k, d, i, ch) PyUnicode_WRITE(k, d, i, ch)
#if defined(PyUnicode_IS_READY) && defined(PyUnicode_GET_SIZE)
#define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u))) #define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u)))
#else
#define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_LENGTH(u))
#endif
#else #else
#define CYTHON_PEP393_ENABLED 0 #define CYTHON_PEP393_ENABLED 0
#define PyUnicode_1BYTE_KIND 1 #define PyUnicode_1BYTE_KIND 1
......
...@@ -66,6 +66,7 @@ static CYTHON_INLINE int __Pyx_UnicodeContainsUCS4(PyObject* unicode, Py_UCS4 ch ...@@ -66,6 +66,7 @@ static CYTHON_INLINE int __Pyx_UnicodeContainsUCS4(PyObject* unicode, Py_UCS4 ch
//////////////////// PyUCS4InUnicode //////////////////// //////////////////// PyUCS4InUnicode ////////////////////
#if Py_UNICODE_SIZE == 2
static int __Pyx_PyUnicodeBufferContainsUCS4_SP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) { static int __Pyx_PyUnicodeBufferContainsUCS4_SP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) {
/* handle surrogate pairs for Py_UNICODE buffers in 16bit Unicode builds */ /* handle surrogate pairs for Py_UNICODE buffers in 16bit Unicode builds */
Py_UNICODE high_val, low_val; Py_UNICODE high_val, low_val;
...@@ -77,6 +78,7 @@ static int __Pyx_PyUnicodeBufferContainsUCS4_SP(Py_UNICODE* buffer, Py_ssize_t l ...@@ -77,6 +78,7 @@ static int __Pyx_PyUnicodeBufferContainsUCS4_SP(Py_UNICODE* buffer, Py_ssize_t l
} }
return 0; return 0;
} }
#endif
static int __Pyx_PyUnicodeBufferContainsUCS4_BMP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) { static int __Pyx_PyUnicodeBufferContainsUCS4_BMP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) {
Py_UNICODE uchar; Py_UNICODE uchar;
...@@ -101,12 +103,15 @@ static CYTHON_INLINE int __Pyx_UnicodeContainsUCS4(PyObject* unicode, Py_UCS4 ch ...@@ -101,12 +103,15 @@ static CYTHON_INLINE int __Pyx_UnicodeContainsUCS4(PyObject* unicode, Py_UCS4 ch
return 0; return 0;
} }
#endif #endif
if (Py_UNICODE_SIZE == 2 && unlikely(character > 65535)) { #if Py_UNICODE_SIZE == 2
if (unlikely(character > 65535)) {
return __Pyx_PyUnicodeBufferContainsUCS4_SP( return __Pyx_PyUnicodeBufferContainsUCS4_SP(
PyUnicode_AS_UNICODE(unicode), PyUnicode_AS_UNICODE(unicode),
PyUnicode_GET_SIZE(unicode), PyUnicode_GET_SIZE(unicode),
character); character);
} else { } else
#endif
{
return __Pyx_PyUnicodeBufferContainsUCS4_BMP( return __Pyx_PyUnicodeBufferContainsUCS4_BMP(
PyUnicode_AS_UNICODE(unicode), PyUnicode_AS_UNICODE(unicode),
PyUnicode_GET_SIZE(unicode), PyUnicode_GET_SIZE(unicode),
......
...@@ -121,4 +121,12 @@ CFLAGS (distutils) = {config_var('CFLAGS')} ...@@ -121,4 +121,12 @@ CFLAGS (distutils) = {config_var('CFLAGS')}
CFLAGS (env) = {get_env('CFLAGS', '')} CFLAGS (env) = {get_env('CFLAGS', '')}
LINKCC (distutils) = {config_var('LINKCC')} LINKCC (distutils) = {config_var('LINKCC')}
LINKCC (env) = {get_env('LINKCC', '')} LINKCC (env) = {get_env('LINKCC', '')}
Encodings:
sys maxunicode = {sys.maxunicode}
LANG (env) = {get_env('LANG', '')}
PYTHONIOENCODING (env) = {get_env('PYTHONIOENCODING', '')}
sys stdout encoding = {sys.stdout.encoding}
sys default encoding = {sys.getdefaultencoding()}
sys FS encoding = {sys.getfilesystemencoding()}
""") """)
...@@ -21,6 +21,13 @@ __doc__ = br""" ...@@ -21,6 +21,13 @@ __doc__ = br"""
u'\udc00' u'\udc00'
>>> h >>> h
u'\ud800' u'\ud800'
>>> q
u'\udc00\ud800'
# The output of surrogate pairs differs between 16/32bit Unicode runtimes.
#>>> p
#u'\ud800\udc00'
>>> add >>> add
u'S\xf8k ik\xfc\xd6\xe4abc' u'S\xf8k ik\xfc\xd6\xe4abc'
>>> null >>> null
...@@ -44,6 +51,10 @@ __doc__ = br""" ...@@ -44,6 +51,10 @@ __doc__ = br"""
1 1
>>> len(h) >>> len(h)
1 1
>>> len(q)
2
>>> len(q)
2
>>> len(add) >>> len(add)
12 12
>>> len(null) >>> len(null)
...@@ -75,6 +86,10 @@ __doc__ = br""" ...@@ -75,6 +86,10 @@ __doc__ = br"""
True True
>>> h == u'\\ud800' # unescaped by Python (required by doctest) >>> h == u'\\ud800' # unescaped by Python (required by doctest)
True True
>>> p == u'\\ud800\\udc00' # unescaped by Python (required by doctest)
True
>>> q == u'\\udc00\\ud800' # unescaped by Python (required by doctest)
True
>>> k == u'\\N{SNOWMAN}' == u'\\u2603' >>> k == u'\\N{SNOWMAN}' == u'\\u2603'
True True
>>> m == u'abc\\\\xf8\\\\t\\u00f8\\U000000f8' # unescaped by Python (required by doctest) >>> m == u'abc\\\\xf8\\\\t\\u00f8\\U000000f8' # unescaped by Python (required by doctest)
...@@ -115,6 +130,8 @@ g = u'\udc00' # lone trail surrogate ...@@ -115,6 +130,8 @@ g = u'\udc00' # lone trail surrogate
h = u'\ud800' # lone lead surrogate h = u'\ud800' # lone lead surrogate
k = u'\N{SNOWMAN}' k = u'\N{SNOWMAN}'
m = ur'abc\xf8\t\u00f8\U000000f8' m = ur'abc\xf8\t\u00f8\U000000f8'
p = u'\ud800\udc00' # surrogate pair
q = u'\udc00\ud800' # reversed surrogate pair
add = u'Søk ik' + u'üÖä' + u'abc' add = u'Søk ik' + u'üÖä' + u'abc'
null = u'\x00' null = u'\x00'
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment