Commit 379ea8d7 authored by Stefan Behnel

Rewrite fstring parsing to match CPython 3.6 and the updated PEP 498

- resolve string escapes only outside of fstring expressions
- reject backslashes inside of fstring expressions
Also make some fstring errors non-fatal to keep parsing.
parent 634c41aa
...@@ -68,10 +68,11 @@ cdef p_opt_string_literal(PyrexScanner s, required_type=*) ...@@ -68,10 +68,11 @@ cdef p_opt_string_literal(PyrexScanner s, required_type=*)
cdef bint check_for_non_ascii_characters(unicode string) cdef bint check_for_non_ascii_characters(unicode string)
@cython.locals(systr=unicode, is_python3_source=bint, is_raw=bint) @cython.locals(systr=unicode, is_python3_source=bint, is_raw=bint)
cdef p_string_literal(PyrexScanner s, kind_override=*) cdef p_string_literal(PyrexScanner s, kind_override=*)
@cython.locals(i=Py_ssize_t, size=Py_ssize_t) cdef _append_escape_sequence(kind, builder, unicode escape_sequence, PyrexScanner s)
cdef list p_f_string(PyrexScanner s, unicode_value, pos) @cython.locals(i=Py_ssize_t, size=Py_ssize_t, c=Py_UCS4)
cdef list p_f_string(PyrexScanner s, unicode unicode_value, pos, bint is_raw)
@cython.locals(i=Py_ssize_t, size=Py_ssize_t, c=Py_UCS4, quote_char=Py_UCS4, NO_CHAR=Py_UCS4) @cython.locals(i=Py_ssize_t, size=Py_ssize_t, c=Py_UCS4, quote_char=Py_UCS4, NO_CHAR=Py_UCS4)
cdef tuple p_f_string_expr(PyrexScanner s, unicode_value, pos, Py_ssize_t starting_index) cdef tuple p_f_string_expr(PyrexScanner s, unicode_value, pos, Py_ssize_t starting_index, bint is_raw)
cdef p_list_maker(PyrexScanner s) cdef p_list_maker(PyrexScanner s)
cdef p_comp_iter(PyrexScanner s, body) cdef p_comp_iter(PyrexScanner s, body)
cdef p_comp_for(PyrexScanner s, body) cdef p_comp_for(PyrexScanner s, body)
......
...@@ -12,7 +12,7 @@ cython.declare(Nodes=object, ExprNodes=object, EncodedString=object, ...@@ -12,7 +12,7 @@ cython.declare(Nodes=object, ExprNodes=object, EncodedString=object,
FileSourceDescriptor=object, lookup_unicodechar=object, FileSourceDescriptor=object, lookup_unicodechar=object,
Future=object, Options=object, error=object, warning=object, Future=object, Options=object, error=object, warning=object,
Builtin=object, ModuleNode=object, Utils=object, Builtin=object, ModuleNode=object, Utils=object,
re=object, _unicode=object, _bytes=object, re=object, _parse_escape_sequences=object, _unicode=object, _bytes=object,
partial=object, reduce=object, _IS_PY3=cython.bint) partial=object, reduce=object, _IS_PY3=cython.bint)
from io import StringIO from io import StringIO
...@@ -811,8 +811,8 @@ def p_cat_string_literal(s): ...@@ -811,8 +811,8 @@ def p_cat_string_literal(s):
if set([kind, next_kind]) in (set(['f', 'u']), set(['f', ''])): if set([kind, next_kind]) in (set(['f', 'u']), set(['f', ''])):
kind = 'f' kind = 'f'
else: else:
error(pos, "Cannot mix string literals of different types, expected %s'', got %s''" % error(pos, "Cannot mix string literals of different types, expected %s'', got %s''" % (
(kind, next_kind)) kind, next_kind))
continue continue
bstrings.append(next_bytes_value) bstrings.append(next_bytes_value)
ustrings.append(next_unicode_value) ustrings.append(next_unicode_value)
...@@ -867,18 +867,17 @@ def p_string_literal(s, kind_override=None): ...@@ -867,18 +867,17 @@ def p_string_literal(s, kind_override=None):
# s.sy == 'BEGIN_STRING' # s.sy == 'BEGIN_STRING'
pos = s.position() pos = s.position()
is_raw = False
is_python3_source = s.context.language_level >= 3 is_python3_source = s.context.language_level >= 3
has_non_ascii_literal_characters = False
kind_string = s.systring.rstrip('"\'').lower() kind_string = s.systring.rstrip('"\'').lower()
if len(set(kind_string)) != len(kind_string): if len(kind_string) > 1:
s.error('Duplicate string prefix character') if len(set(kind_string)) != len(kind_string):
if 'b' in kind_string and 'u' in kind_string: error(pos, 'Duplicate string prefix character')
s.error('String prefixes b and u cannot be combined') if 'b' in kind_string and 'u' in kind_string:
if 'b' in kind_string and 'f' in kind_string: error(pos, 'String prefixes b and u cannot be combined')
s.error('String prefixes b and f cannot be combined') if 'b' in kind_string and 'f' in kind_string:
if 'u' in kind_string and 'f' in kind_string: error(pos, 'String prefixes b and f cannot be combined')
s.error('String prefixes u and f cannot be combined') if 'u' in kind_string and 'f' in kind_string:
error(pos, 'String prefixes u and f cannot be combined')
is_raw = 'r' in kind_string is_raw = 'r' in kind_string
...@@ -886,10 +885,11 @@ def p_string_literal(s, kind_override=None): ...@@ -886,10 +885,11 @@ def p_string_literal(s, kind_override=None):
# this should never happen, since the lexer does not allow combining c # this should never happen, since the lexer does not allow combining c
# with other prefix characters # with other prefix characters
if len(kind_string) != 1: if len(kind_string) != 1:
s.error('Invalid string prefix for character literal') error(pos, 'Invalid string prefix for character literal')
kind = 'c' kind = 'c'
elif 'f' in kind_string: elif 'f' in kind_string:
kind = 'f' # u is ignored kind = 'f' # u is ignored
is_raw = True # postpone the escape resolution
elif 'b' in kind_string: elif 'b' in kind_string:
kind = 'b' kind = 'b'
elif 'u' in kind_string: elif 'u' in kind_string:
...@@ -917,54 +917,11 @@ def p_string_literal(s, kind_override=None): ...@@ -917,54 +917,11 @@ def p_string_literal(s, kind_override=None):
# print "p_string_literal: sy =", sy, repr(s.systring) ### # print "p_string_literal: sy =", sy, repr(s.systring) ###
if sy == 'CHARS': if sy == 'CHARS':
chars.append(systr) chars.append(systr)
if is_python3_source and not has_non_ascii_literal_characters and check_for_non_ascii_characters(systr):
has_non_ascii_literal_characters = True
elif sy == 'ESCAPE': elif sy == 'ESCAPE':
if is_raw: if is_raw:
chars.append(systr) chars.append(systr)
if is_python3_source and not has_non_ascii_literal_characters \
and check_for_non_ascii_characters(systr):
has_non_ascii_literal_characters = True
else: else:
c = systr[1] _append_escape_sequence(kind, chars, systr, s)
if c in u"01234567":
chars.append_charval( int(systr[1:], 8) )
elif c in u"'\"\\":
chars.append(c)
elif c in u"abfnrtv":
chars.append(
StringEncoding.char_from_escape_sequence(systr))
elif c == u'\n':
pass
elif c == u'x': # \xXX
if len(systr) == 4:
chars.append_charval( int(systr[2:], 16) )
else:
s.error("Invalid hex escape '%s'" % systr,
fatal=False)
elif c in u'NUu' and kind in ('u', 'f', ''): # \uxxxx, \Uxxxxxxxx, \N{...}
chrval = -1
if c == u'N':
try:
chrval = ord(lookup_unicodechar(systr[3:-1]))
except KeyError:
s.error("Unknown Unicode character name %s" %
repr(systr[3:-1]).lstrip('u'))
elif len(systr) in (6,10):
chrval = int(systr[2:], 16)
if chrval > 1114111: # sys.maxunicode:
s.error("Invalid unicode escape '%s'" % systr)
chrval = -1
else:
s.error("Invalid unicode escape '%s'" % systr,
fatal=False)
if chrval >= 0:
chars.append_uescape(chrval, systr)
else:
chars.append(u'\\' + systr[1:])
if is_python3_source and not has_non_ascii_literal_characters \
and check_for_non_ascii_characters(systr):
has_non_ascii_literal_characters = True
elif sy == 'NEWLINE': elif sy == 'NEWLINE':
chars.append(u'\n') chars.append(u'\n')
elif sy == 'END_STRING': elif sy == 'END_STRING':
...@@ -972,8 +929,8 @@ def p_string_literal(s, kind_override=None): ...@@ -972,8 +929,8 @@ def p_string_literal(s, kind_override=None):
elif sy == 'EOF': elif sy == 'EOF':
s.error("Unclosed string literal", pos=pos) s.error("Unclosed string literal", pos=pos)
else: else:
s.error("Unexpected token %r:%r in string literal" % s.error("Unexpected token %r:%r in string literal" % (
(sy, s.systring)) sy, s.systring))
if kind == 'c': if kind == 'c':
unicode_value = None unicode_value = None
...@@ -982,50 +939,119 @@ def p_string_literal(s, kind_override=None): ...@@ -982,50 +939,119 @@ def p_string_literal(s, kind_override=None):
error(pos, u"invalid character literal: %r" % bytes_value) error(pos, u"invalid character literal: %r" % bytes_value)
else: else:
bytes_value, unicode_value = chars.getstrings() bytes_value, unicode_value = chars.getstrings()
if is_python3_source and has_non_ascii_literal_characters: if is_python3_source and check_for_non_ascii_characters(unicode_value):
# Python 3 forbids literal non-ASCII characters in byte strings # Python 3 forbids literal non-ASCII characters in byte strings
if kind not in ('u', 'f'): if kind not in ('u', 'f'):
s.error("bytes can only contain ASCII literal characters.", error(pos, "bytes can only contain ASCII literal characters.")
pos=pos, fatal=False)
bytes_value = None bytes_value = None
if kind == 'f': if kind == 'f':
unicode_value = p_f_string(s, unicode_value, pos) unicode_value = p_f_string(s, unicode_value, pos, is_raw='r' in kind_string)
s.next() s.next()
return (kind, bytes_value, unicode_value) return (kind, bytes_value, unicode_value)
def p_f_string(s, unicode_value, pos): def _append_escape_sequence(kind, builder, escape_sequence, s):
c = escape_sequence[1]
if c in u"01234567":
builder.append_charval(int(escape_sequence[1:], 8))
elif c in u"'\"\\":
builder.append(c)
elif c in u"abfnrtv":
builder.append(StringEncoding.char_from_escape_sequence(escape_sequence))
elif c == u'\n':
pass # line continuation
elif c == u'x': # \xXX
if len(escape_sequence) == 4:
builder.append_charval(int(escape_sequence[2:], 16))
else:
s.error("Invalid hex escape '%s'" % escape_sequence, fatal=False)
elif c in u'NUu' and kind in ('u', 'f', ''): # \uxxxx, \Uxxxxxxxx, \N{...}
chrval = -1
if c == u'N':
try:
chrval = ord(lookup_unicodechar(escape_sequence[3:-1]))
except KeyError:
s.error("Unknown Unicode character name %s" %
repr(escape_sequence[3:-1]).lstrip('u'), fatal=False)
elif len(escape_sequence) in (6, 10):
chrval = int(escape_sequence[2:], 16)
if chrval > 1114111: # sys.maxunicode:
s.error("Invalid unicode escape '%s'" % escape_sequence)
chrval = -1
else:
s.error("Invalid unicode escape '%s'" % escape_sequence, fatal=False)
if chrval >= 0:
builder.append_uescape(chrval, escape_sequence)
else:
builder.append(escape_sequence)
_parse_escape_sequences = re.compile(
# escape sequences:
br'(\\(?:'
br'[\\abfnrtv"\'{]|'
br'[0-7]{2,3}|'
br'N\{[^}]*\}|'
br'x[0-9a-fA-F]{2}|'
br'u[0-9a-fA-F]{4}|'
br'U[0-9a-fA-F]{8}|'
br'[NuU]|' # detect invalid escape sequences that do not match above
br')?|'
# non-escape sequences:
br'\{\{?|'
br'\}\}?|'
br'[^\\{}]+)'.decode('us-ascii')
).match
def p_f_string(s, unicode_value, pos, is_raw):
# Parses a PEP 498 f-string literal into a list of nodes. Nodes are either UnicodeNodes # Parses a PEP 498 f-string literal into a list of nodes. Nodes are either UnicodeNodes
# or FormattedValueNodes. # or FormattedValueNodes.
values = [] values = []
i = 0 next_start = 0
size = len(unicode_value) size = len(unicode_value)
current_literal_start = 0 builder = StringEncoding.UnicodeLiteralBuilder()
while i < size:
c = unicode_value[i] while next_start < size:
if c in '{}': end = next_start
if i + 1 < size and unicode_value[i + 1] == c: match = _parse_escape_sequences(unicode_value, next_start)
encoded_str = EncodedString(unicode_value[current_literal_start:i + 1]) if match is None:
values.append(ExprNodes.UnicodeNode(pos, value=encoded_str)) error_pos = (pos[0], pos[1] + end, pos[2]) # FIXME: handle newlines in string
i += 2 error(error_pos, "Invalid escape sequence")
current_literal_start = i
elif c == '}': next_start = match.end()
s.error("single '}' encountered in format string") part = match.group()
c = part[0]
if c == '\\':
if not is_raw and len(part) > 1:
_append_escape_sequence('f', builder, part, s)
else: else:
encoded_str = EncodedString(unicode_value[current_literal_start:i]) builder.append(part)
values.append(ExprNodes.UnicodeNode(pos, value=encoded_str)) elif c == '{':
i, expr_node = p_f_string_expr(s, unicode_value, pos, i + 1) if part == '{{':
current_literal_start = i builder.append('{')
else:
# start of an expression
if builder.chars:
values.append(ExprNodes.UnicodeNode(pos, value=builder.getstring()))
builder = StringEncoding.UnicodeLiteralBuilder()
next_start, expr_node = p_f_string_expr(s, unicode_value, pos, next_start, is_raw)
values.append(expr_node) values.append(expr_node)
elif c == '}':
if part == '}}':
builder.append('}')
else:
error_pos = (pos[0], pos[1] + end, pos[2]) # FIXME: handle newlines in string
s.error("f-string: single '}' is not allowed", pos=error_pos)
else: else:
i += 1 builder.append(part)
encoded_str = EncodedString(unicode_value[current_literal_start:]) if builder.chars:
values.append(ExprNodes.UnicodeNode(pos, value=encoded_str)) values.append(ExprNodes.UnicodeNode(pos, value=builder.getstring()))
return values return values
def p_f_string_expr(s, unicode_value, pos, starting_index): def p_f_string_expr(s, unicode_value, pos, starting_index, is_raw):
# Parses a {}-delimited expression inside an f-string. Returns a FormattedValueNode # Parses a {}-delimited expression inside an f-string. Returns a FormattedValueNode
# and the index in the string that follows the expression. # and the index in the string that follows the expression.
i = starting_index i = starting_index
...@@ -1045,7 +1071,8 @@ def p_f_string_expr(s, unicode_value, pos, starting_index): ...@@ -1045,7 +1071,8 @@ def p_f_string_expr(s, unicode_value, pos, starting_index):
if quote_char != NO_CHAR: if quote_char != NO_CHAR:
if c == '\\': if c == '\\':
i += 1 error_pos = (pos[0], pos[1] + i, pos[2]) # FIXME: handle newlines in string
error(error_pos, "backslashes not allowed in f-strings")
elif c == quote_char: elif c == quote_char:
if in_triple_quotes: if in_triple_quotes:
if i + 2 < size and unicode_value[i + 1] == c and unicode_value[i + 2] == c: if i + 2 < size and unicode_value[i + 1] == c and unicode_value[i + 2] == c:
...@@ -1080,15 +1107,16 @@ def p_f_string_expr(s, unicode_value, pos, starting_index): ...@@ -1080,15 +1107,16 @@ def p_f_string_expr(s, unicode_value, pos, starting_index):
expr_pos = (pos[0], pos[1], pos[2] + starting_index + 2) # TODO: find exact code position (concat, multi-line, ...) expr_pos = (pos[0], pos[1], pos[2] + starting_index + 2) # TODO: find exact code position (concat, multi-line, ...)
if not expr_str.strip(): if not expr_str.strip():
s.error("empty expression not allowed in f-string") error(pos, "empty expression not allowed in f-string")
if terminal_char == '!': if terminal_char == '!':
i += 1 i += 1
if i + 2 > size: if i + 2 > size:
s.error("invalid conversion char at end of string") error(pos, "invalid conversion char at end of string")
conversion_char = unicode_value[i] else:
i += 1 conversion_char = unicode_value[i]
terminal_char = unicode_value[i] i += 1
terminal_char = unicode_value[i]
if terminal_char == ':': if terminal_char == ':':
in_triple_quotes = False in_triple_quotes = False
...@@ -1128,14 +1156,14 @@ def p_f_string_expr(s, unicode_value, pos, starting_index): ...@@ -1128,14 +1156,14 @@ def p_f_string_expr(s, unicode_value, pos, starting_index):
# validate the conversion char # validate the conversion char
if conversion_char is not None and not ExprNodes.FormattedValueNode.find_conversion_func(conversion_char): if conversion_char is not None and not ExprNodes.FormattedValueNode.find_conversion_func(conversion_char):
s.error("invalid conversion character '%s'" % conversion_char) error(pos, "invalid conversion character '%s'" % conversion_char)
# the format spec is itself treated like an f-string # the format spec is itself treated like an f-string
if format_spec_str: if format_spec_str:
format_spec = ExprNodes.JoinedStrNode(pos, values=p_f_string(s, format_spec_str, pos)) format_spec = ExprNodes.JoinedStrNode(pos, values=p_f_string(s, format_spec_str, pos, is_raw))
return i + 1, ExprNodes.FormattedValueNode( return i + 1, ExprNodes.FormattedValueNode(
s.position(), value=expr, conversion_char=conversion_char, format_spec=format_spec) pos, value=expr, conversion_char=conversion_char, format_spec=format_spec)
# since PEP 448: # since PEP 448:
......
...@@ -18,6 +18,15 @@ max_long = LONG_MAX ...@@ -18,6 +18,15 @@ max_long = LONG_MAX
min_long = LONG_MIN min_long = LONG_MIN
def escaping():
"""
>>> escaping()
"""
assert f'{{{{{"abc"}}}}}{{}}{{' == '{{abc}}{}{'
assert f'\x7b}}' == '{}'
assert f'{"{{}}"}' == '{{}}'
def format2(ab, cd): def format2(ab, cd):
""" """
>>> a, b, c = format2(1, 2) >>> a, b, c = format2(1, 2)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment