Commit a446a4b5 authored by Stefan Behnel's avatar Stefan Behnel

Support parsing of named Unicode escapes outside of the BMP in CPythons...

Support parsing of named Unicode escapes outside of the BMP in CPythons versions with a 2-byte Unicode representation.
Closes #1693.
parent 429e8ecf
......@@ -9,16 +9,16 @@ from __future__ import absolute_import
import cython
cython.declare(Nodes=object, ExprNodes=object, EncodedString=object,
bytes_literal=object, StringEncoding=object,
FileSourceDescriptor=object, lookup_unicodechar=object,
FileSourceDescriptor=object, lookup_unicodechar=object, unicode_category=object,
Future=object, Options=object, error=object, warning=object,
Builtin=object, ModuleNode=object, Utils=object,
re=object, _parse_escape_sequences=object, _unicode=object, _bytes=object,
partial=object, reduce=object, _IS_PY3=cython.bint)
re=object, sys=object, _parse_escape_sequences=object, _unicode=object, _bytes=object,
partial=object, reduce=object, _IS_PY3=cython.bint, _IS_2BYTE_UNICODE=cython.bint)
from io import StringIO
import re
import sys
from unicodedata import lookup as lookup_unicodechar
from unicodedata import lookup as lookup_unicodechar, category as unicode_category
from functools import partial, reduce
from .Scanning import PyrexScanner, FileSourceDescriptor, StringSourceDescriptor
......@@ -34,6 +34,7 @@ from . import Future
from . import Options
_IS_PY3 = sys.version_info[0] >= 3
_IS_2BYTE_UNICODE = sys.maxunicode == 0xffff
class Ctx(object):
......@@ -974,11 +975,21 @@ def _append_escape_sequence(kind, builder, escape_sequence, s):
elif c in u'NUu' and kind in ('u', 'f', ''): # \uxxxx, \Uxxxxxxxx, \N{...}
chrval = -1
if c == u'N':
uchar = None
try:
chrval = ord(lookup_unicodechar(escape_sequence[3:-1]))
uchar = lookup_unicodechar(escape_sequence[3:-1])
chrval = ord(uchar)
except KeyError:
s.error("Unknown Unicode character name %s" %
repr(escape_sequence[3:-1]).lstrip('u'), fatal=False)
except TypeError:
# 2-byte unicode build of CPython?
if (uchar is not None and _IS_2BYTE_UNICODE and len(uchar) == 2 and
unicode_category(uchar[0]) == 'Cs' and unicode_category(uchar[1]) == 'Cs'):
# surrogate pair instead of single character
chrval = 0x10000 + (ord(uchar[0]) - 0xd800) >> 10 + (ord(uchar[1]) - 0xdc00)
else:
raise
elif len(escape_sequence) in (6, 10):
chrval = int(escape_sequence[2:], 16)
if chrval > 1114111: # sys.maxunicode:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment