Support parsing of named Unicode escapes outside of the BMP in CPython...

Support parsing of named Unicode escapes outside of the BMP in CPython versions with a 2-byte Unicode representation. Closes #1693.

Support parsing of named Unicode escapes outside of the BMP in CPython...
Support parsing of named Unicode escapes outside of the BMP in CPython versions with a 2-byte Unicode representation. Closes #1693.
a446a4b5 · Stefan Behnel · 429e8ecf · a446a4b5
Commit a446a4b5 authored May 31, 2017 by Stefan Behnel
Hide whitespace changes
Inline Side-by-side

Showing with 16 additions and 5 deletions

Cython/Compiler/Parsing.py Cython/Compiler/Parsing.py +16 -5

No files found.
--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -9,16 +9,16 @@ from __future__ import absolute_import
 import cython
 cython.declare(Nodes=object, ExprNodes=object, EncodedString=object,
               bytes_literal=object, StringEncoding=object,
-               FileSourceDescriptor=object, lookup_unicodechar=object,
+               FileSourceDescriptor=object, lookup_unicodechar=object, unicode_category=object,
               Future=object, Options=object, error=object, warning=object,
               Builtin=object, ModuleNode=object, Utils=object,
-               re=object, _parse_escape_sequences=object, _unicode=object, _bytes=object,
+               re=object, sys=object, _parse_escape_sequences=object, _unicode=object, _bytes=object,
-               partial=object, reduce=object, _IS_PY3=cython.bint)
+               partial=object, reduce=object, _IS_PY3=cython.bint, _IS_2BYTE_UNICODE=cython.bint)
 from io import StringIO
 import re
 import sys
-from unicodedata import lookup as lookup_unicodechar
+from unicodedata import lookup as lookup_unicodechar, category as unicode_category
 from functools import partial, reduce
 from .Scanning import PyrexScanner, FileSourceDescriptor, StringSourceDescriptor
@@ -34,6 +34,7 @@ from . import Future
 from . import Options
 _IS_PY3 = sys.version_info[0] >= 3
+_IS_2BYTE_UNICODE = sys.maxunicode == 0xffff
 class Ctx(object):
@@ -974,11 +975,21 @@ def _append_escape_sequence(kind, builder, escape_sequence, s):
    elif c in u'NUu' and kind in ('u', 'f', ''):  # \uxxxx, \Uxxxxxxxx, \N{...}
        chrval = -1
        if c == u'N':
+            uchar = None
            try:
-                chrval = ord(lookup_unicodechar(escape_sequence[3:-1]))
+                uchar = lookup_unicodechar(escape_sequence[3:-1])
+                chrval = ord(uchar)
            except KeyError:
                s.error("Unknown Unicode character name %s" %
                        repr(escape_sequence[3:-1]).lstrip('u'), fatal=False)
+            except TypeError:
+                # 2-byte unicode build of CPython?
+                if (uchar is not None and _IS_2BYTE_UNICODE and len(uchar) == 2 and
+                        unicode_category(uchar[0]) == 'Cs' and unicode_category(uchar[1]) == 'Cs'):
+                    # surrogate pair, not one character: high half gives the upper 10 bits, low half the lower 10
+                    chrval = 0x10000 + ((ord(uchar[0]) - 0xd800) << 10) + (ord(uchar[1]) - 0xdc00)
+                else:
+                    raise
        elif len(escape_sequence) in (6, 10):
            chrval = int(escape_sequence[2:], 16)
            if chrval > 1114111:  # sys.maxunicode: