Commit a446a4b5 authored by Stefan Behnel

Support parsing of named Unicode escapes outside of the BMP in CPython...

Support parsing of named Unicode escapes outside of the BMP in CPython versions with a 2-byte Unicode representation.
Closes #1693.
parent 429e8ecf
...@@ -9,16 +9,16 @@ from __future__ import absolute_import ...@@ -9,16 +9,16 @@ from __future__ import absolute_import
import cython import cython
cython.declare(Nodes=object, ExprNodes=object, EncodedString=object, cython.declare(Nodes=object, ExprNodes=object, EncodedString=object,
bytes_literal=object, StringEncoding=object, bytes_literal=object, StringEncoding=object,
FileSourceDescriptor=object, lookup_unicodechar=object, FileSourceDescriptor=object, lookup_unicodechar=object, unicode_category=object,
Future=object, Options=object, error=object, warning=object, Future=object, Options=object, error=object, warning=object,
Builtin=object, ModuleNode=object, Utils=object, Builtin=object, ModuleNode=object, Utils=object,
re=object, _parse_escape_sequences=object, _unicode=object, _bytes=object, re=object, sys=object, _parse_escape_sequences=object, _unicode=object, _bytes=object,
partial=object, reduce=object, _IS_PY3=cython.bint) partial=object, reduce=object, _IS_PY3=cython.bint, _IS_2BYTE_UNICODE=cython.bint)
from io import StringIO from io import StringIO
import re import re
import sys import sys
from unicodedata import lookup as lookup_unicodechar from unicodedata import lookup as lookup_unicodechar, category as unicode_category
from functools import partial, reduce from functools import partial, reduce
from .Scanning import PyrexScanner, FileSourceDescriptor, StringSourceDescriptor from .Scanning import PyrexScanner, FileSourceDescriptor, StringSourceDescriptor
...@@ -34,6 +34,7 @@ from . import Future ...@@ -34,6 +34,7 @@ from . import Future
from . import Options from . import Options
_IS_PY3 = sys.version_info[0] >= 3 _IS_PY3 = sys.version_info[0] >= 3
_IS_2BYTE_UNICODE = sys.maxunicode == 0xffff
class Ctx(object): class Ctx(object):
...@@ -974,11 +975,21 @@ def _append_escape_sequence(kind, builder, escape_sequence, s): ...@@ -974,11 +975,21 @@ def _append_escape_sequence(kind, builder, escape_sequence, s):
elif c in u'NUu' and kind in ('u', 'f', ''): # \uxxxx, \Uxxxxxxxx, \N{...} elif c in u'NUu' and kind in ('u', 'f', ''): # \uxxxx, \Uxxxxxxxx, \N{...}
chrval = -1 chrval = -1
if c == u'N': if c == u'N':
uchar = None
try: try:
chrval = ord(lookup_unicodechar(escape_sequence[3:-1])) uchar = lookup_unicodechar(escape_sequence[3:-1])
chrval = ord(uchar)
except KeyError: except KeyError:
s.error("Unknown Unicode character name %s" % s.error("Unknown Unicode character name %s" %
repr(escape_sequence[3:-1]).lstrip('u'), fatal=False) repr(escape_sequence[3:-1]).lstrip('u'), fatal=False)
except TypeError:
# 2-byte unicode build of CPython?
if (uchar is not None and _IS_2BYTE_UNICODE and len(uchar) == 2 and
unicode_category(uchar[0]) == 'Cs' and unicode_category(uchar[1]) == 'Cs'):
# surrogate pair instead of single character
chrval = 0x10000 + ((ord(uchar[0]) - 0xd800) << 10) + (ord(uchar[1]) - 0xdc00)
else:
raise
elif len(escape_sequence) in (6, 10): elif len(escape_sequence) in (6, 10):
chrval = int(escape_sequence[2:], 16) chrval = int(escape_sequence[2:], 16)
if chrval > 1114111: # sys.maxunicode: if chrval > 1114111: # sys.maxunicode:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment