Commit ca8763a2 authored by da-woods, committed by Stefan Behnel

Handle normalization of unicode identifiers (GH-3096)

parent 00c1dc96
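For context, a rough standalone sketch of the behaviour this commit brings to the scanner (not code from the diff): per PEP 3131, Python treats identifiers as equal after NFKC normalization, so differently encoded spellings of the same name must end up as one identifier.

    # quick illustration only; assumes a UTF-8 source file
    from unicodedata import normalize

    raw = [u"\ufb01nd", u"find"]   # "fi" ligature + "nd" vs. the plain letters "find"
    assert raw[0] != raw[1]                                        # the raw strings differ
    assert normalize('NFKC', raw[0]) == normalize('NFKC', raw[1])  # but they name the same identifier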
@@ -85,7 +85,7 @@ def make_lexicon():
     comment = Str("#") + Rep(AnyBut("\n"))

     return Lexicon([
-        (name, IDENT),
+        (name, Method('normalize_ident')),
         (intliteral, Method('strip_underscores', symbol='INT')),
         (fltconst, Method('strip_underscores', symbol='FLOAT')),
         (imagconst, Method('strip_underscores', symbol='IMAG')),
...
@@ -12,6 +12,7 @@ cython.declare(make_lexicon=object, lexicon=object,

 import os
 import platform
+from unicodedata import normalize

 from .. import Utils
 from ..Plex.Scanners import Scanner
@@ -341,6 +342,13 @@ class PyrexScanner(Scanner):
         self.sy = ''
         self.next()

+    def normalize_ident(self, text):
+        try:
+            text.encode('ascii')  # really just name.isascii but supports Python 2 and 3
+        except UnicodeEncodeError:
+            text = normalize('NFKC', text)
+        self.produce(IDENT, text)
+
     def commentline(self, text):
         if self.parse_comments:
             self.produce('commentline', text)
...
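A rough sketch of what the new normalize_ident hook does to identifier text (a standalone illustration, not part of the diff): ASCII names take the cheap fast path and are left untouched; only non-ASCII names are NFKC-normalized before being produced as IDENT tokens.

    from unicodedata import normalize

    def normalized_name(text):
        # mirrors the scanner's fast path: ASCII names skip normalization entirely
        try:
            text.encode('ascii')
        except UnicodeEncodeError:
            text = normalize('NFKC', text)
        return text

    assert normalized_name(u"plain_name") == u"plain_name"   # unchanged, normalize() never called
    assert normalized_name(u"cafe\u0301") == u"caf\u00e9"    # combining accent folds into the precomposed form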
@@ -216,7 +216,7 @@ def decode_filename(filename):
 _match_file_encoding = re.compile(br"(\w*coding)[:=]\s*([-\w.]+)").search


-def detect_opened_file_encoding(f):
+def detect_opened_file_encoding(f, default='UTF-8'):
     # PEPs 263 and 3120
     # Most of the time the first two lines fall in the first couple of hundred chars,
     # and this bulk read/split is much faster.
@@ -236,7 +236,7 @@ def detect_opened_file_encoding(f):
             m = _match_file_encoding(lines[1])
             if m:
                 return m.group(2).decode('iso8859-1')
-    return "UTF-8"
+    return default


 def skip_bom(f):
...
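A small usage sketch for the new default parameter (not from the diff; the BytesIO inputs are made up): passing default=None lets a caller distinguish "no coding cookie found" from an explicit declaration.

    import io
    from Cython.Utils import detect_opened_file_encoding

    with_cookie = io.BytesIO(b"# -*- coding: iso-8859-15 -*-\nx = 1\n")
    no_cookie = io.BytesIO(b"x = 1\ny = 2\n")

    assert detect_opened_file_encoding(with_cookie) == "iso-8859-15"      # the cookie wins
    assert detect_opened_file_encoding(no_cookie, default=None) is None   # caller sees "not declared"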
@@ -545,9 +545,14 @@ class build_ext(_build_ext):
 class ErrorWriter(object):
     match_error = re.compile(r'(warning:)?(?:.*:)?\s*([-0-9]+)\s*:\s*([-0-9]+)\s*:\s*(.*)').match

-    def __init__(self):
+    def __init__(self, encoding=None):
         self.output = []
-        self.write = self.output.append
+        self.encoding = encoding
+
+    def write(self, value):
+        if self.encoding:
+            value = value.encode('ISO-8859-1').decode(self.encoding)
+        self.output.append(value)

     def _collect(self):
         s = ''.join(self.output)
@@ -1002,6 +1007,13 @@ class CythonCompileTestCase(unittest.TestCase):

     def split_source_and_output(self, test_directory, module, workdir):
         source_file = self.find_module_source_file(os.path.join(test_directory, module) + '.pyx')
+
+        from Cython.Utils import detect_opened_file_encoding
+        with io_open(source_file, 'rb') as f:
+            # encoding is passed to ErrorWriter but not used on the source
+            # since it is sometimes deliberately wrong
+            encoding = detect_opened_file_encoding(f, default=None)
+
         with io_open(source_file, 'r', encoding='ISO-8859-1') as source_and_output:
             error_writer = warnings_writer = None
             out = io_open(os.path.join(workdir, module + os.path.splitext(source_file)[1]),
@@ -1010,10 +1022,10 @@ class CythonCompileTestCase(unittest.TestCase):
                 for line in source_and_output:
                     if line.startswith("_ERRORS"):
                         out.close()
-                        out = error_writer = ErrorWriter()
+                        out = error_writer = ErrorWriter(encoding=encoding)
                     elif line.startswith("_WARNINGS"):
                         out.close()
-                        out = warnings_writer = ErrorWriter()
+                        out = warnings_writer = ErrorWriter(encoding=encoding)
                     else:
                         out.write(line)
             finally:
...
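The encode('ISO-8859-1').decode(encoding) round-trip in ErrorWriter works because the test file is read as Latin-1, which maps every byte to a code point one-to-one; re-encoding as ISO-8859-1 therefore recovers the original bytes, which can then be decoded with the encoding the test file actually declared. A standalone sketch (not part of the diff):

    # bytes as they would sit on disk in a UTF-8 encoded test file
    original_bytes = u"4:0: Unrecognized character '\u00e9'".encode("utf-8")

    as_latin1 = original_bytes.decode("ISO-8859-1")               # how runtests reads every test file
    recovered = as_latin1.encode("ISO-8859-1").decode("utf-8")    # ErrorWriter's round-trip

    assert recovered == u"4:0: Unrecognized character '\u00e9'"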
# -*- coding: utf-8 -*-
# mode: error

1 = 5 # invalid start symbol

_ERRORS = u"""
4:0: Unrecognized character
"""

...

# -*- coding: utf-8 -*-
# mode: error

class MyClass: # invalid continue symbol
    pass

_ERRORS = u"""
4:13: Unrecognized character
"""

...

# -*- coding: utf-8 -*-
# mode: error

def f():
    a = 1
    ́b = 2 # looks like an indentation error but is actually a combining accent as the first character of column 4
    c = 3

_ERRORS = u"""
6:4: Unrecognized character
"""

...

# -*- coding: utf-8 -*-
# mode: error

cdef class C:
    # these two symbols "\u1e69" and "\u1e9b\u0323" normalize to the same thing
    # so the two attributes can't coexist
    cdef int ṩomething
    cdef double ẛ̣omething

_ERRORS = u"""
7:13: Previous declaration is here
8:16: 'ṩomething' redeclared
"""
@@ -49,6 +49,12 @@ if sys.version_info[0]>2:
     10
     >>> NormalClassΓΓ().εxciting_function(None).__qualname__
     'NormalClassΓΓ.εxciting_function.<locals>.nestεd'
+
+    Do kwargs work?
+    >>> unicode_kwarg(αrg=5)
+    5
+    >>> unicode_kwarg_from_cy()
+    1
     """
 else:
     __doc__ = ""
@@ -184,6 +190,28 @@ class NormalClassΓΓ(Γναμε2):
             pass
         return nestεd

+def unicode_kwarg(*, αrg):
+    return αrg
+
+def unicode_kwarg_from_cy():
+    return unicode_kwarg(αrg=1)
+
+cdef class NormalizeAttrCdef:
+    """Python normalizes identifier names before they are used;
+    therefore ﬁ and fi should access the same attribute.
+    A more comprehensive version of this is in "unicode_identifiers_normalize.py",
+    comparing the behaviour to Python. The version here shows that it
+    behaves the same way in a cdef class, and it is also tested with Python 2.
+
+    >>> NormalizeAttrCdef().get()
+    5
+    """
+    cdef int ﬁ  # note unicode ligature symbol
+    def __init__(self):
+        self.fi = 5
+    def get(self):
+        return self.fi
+
 if sys.version_info[0]<=2:
     # These symbols are causing problems for doctest
     del NormalClassΓΓ
...
# -*- coding: utf-8 -*-
# mode: run
# tag: pure3.0, pep3131
PYTHON build_tests.py
# show behaviour in Python mode
PYTHON -m doctest test0.py
PYTHON -m doctest test1.py
PYTHON -m doctest test2.py
PYTHON setup.py build_ext --inplace
# test in Cython mode
PYTHON -c "import doctest; import test0 as m; exit(doctest.testmod(m)[0])"
PYTHON -c "import doctest; import test1 as m; exit(doctest.testmod(m)[0])"
PYTHON -c "import doctest; import test2 as m; exit(doctest.testmod(m)[0])"
########## setup.py #########
from Cython.Build.Dependencies import cythonize
from distutils.core import setup
setup(
    ext_modules = cythonize("test*.py"),
)
######### build_tests.py ########
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys
import unicodedata
# a few pairs of unicode strings that should be equivalent after normalization
string_pairs = [("ﬁ", "fi"),  # ligature and two letters
                ("a\u0301", '\u00e1'),  # a with acute accent, as a combining character or as 1 character
                ("α\u0334\u0362", "α\u0362\u0334")  # alpha with a pair of combining characters
                                                    # in a different order; no single character to normalize to
                ]

# Show that the pairs genuinely aren't equal before normalization,
# but become equal after NFKC normalization.
for sp in string_pairs:
    assert sp[0] != sp[1]
    assert unicodedata.normalize('NFKC', sp[0]) == unicodedata.normalize('NFKC', sp[1])
# some code that accesses the identifiers through the two different names
# contains doctests
example_code = [
"""
class C:
    '''
    >>> C().get()
    True
    '''
    def __init__(self):
        self.{0} = True
    def get(self):
        return self.{1}
""", """
def pass_through({0}):
    '''
    >>> pass_through(True)
    True
    '''
    return {1}
""", """
import cython
{0} = True
def test():
    '''
    >>> test()
    True
    '''
    return {1}
"""]
for idx in range(len(example_code)):
    with open("test{0}.py".format(idx), "w") as f:
        if sys.version_info[0] > 2:
            f.write("# -*- coding: utf-8 -*-\n")
            f.write(example_code[idx].format(*string_pairs[idx]))
        else:
            f.write("\n")  # code isn't Python 2 compatible - write a dummy file