Rewrite fstring parsing to match CPython 3.6 and the updated PEP 498

- resolve string escapes only outside of fstring expressions
- reject backslashes inside of fstring expressions

Also make some fstring errors non-fatal to keep parsing.

Rewrite fstring parsing to match CPython 3.6 and the updated PEP 498
- resolve string escapes only outside of fstring expressions
- reject backslashes inside of fstring expressions

Also make some fstring errors non-fatal to keep parsing.
379ea8d7 · Stefan Behnel · 634c41aa · 379ea8d7 · 379ea8d7 · 379ea8d7
Commit 379ea8d7 authored Feb 11, 2017 by Stefan Behnel
Showing with 136 additions and 98 deletions

Cython/Compiler/Parsing.pxd Cython/Compiler/Parsing.pxd +4 -3

Cython/Compiler/Parsing.py Cython/Compiler/Parsing.py +123 -95

tests/run/fstring.pyx tests/run/fstring.pyx +9 -0

No files found.
--- a/Cython/Compiler/Parsing.pxd
+++ b/Cython/Compiler/Parsing.pxd
@@ -68,10 +68,11 @@ cdef p_opt_string_literal(PyrexScanner s, required_type=*)
 cdef bint check_for_non_ascii_characters(unicode string)
 @cython.locals(systr=unicode, is_python3_source=bint, is_raw=bint)
 cdef p_string_literal(PyrexScanner s, kind_override=*)
-@cython.locals(i=Py_ssize_t, size=Py_ssize_t)
+cdef _append_escape_sequence(kind, builder, unicode escape_sequence, PyrexScanner s)
-cdef list p_f_string(PyrexScanner s, unicode_value, pos)
+@cython.locals(i=Py_ssize_t, size=Py_ssize_t, c=Py_UCS4)
+cdef list p_f_string(PyrexScanner s, unicode unicode_value, pos, bint is_raw)
 @cython.locals(i=Py_ssize_t, size=Py_ssize_t, c=Py_UCS4, quote_char=Py_UCS4, NO_CHAR=Py_UCS4)
-cdef tuple p_f_string_expr(PyrexScanner s, unicode_value, pos, Py_ssize_t starting_index)
+cdef tuple p_f_string_expr(PyrexScanner s, unicode_value, pos, Py_ssize_t starting_index, bint is_raw)
 cdef p_list_maker(PyrexScanner s)
 cdef p_comp_iter(PyrexScanner s, body)
 cdef p_comp_for(PyrexScanner s, body)

--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -12,7 +12,7 @@ cython.declare(Nodes=object, ExprNodes=object, EncodedString=object,
               FileSourceDescriptor=object, lookup_unicodechar=object,
               Future=object, Options=object, error=object, warning=object,
               Builtin=object, ModuleNode=object, Utils=object,
-               re=object, _unicode=object, _bytes=object,
+               re=object, _parse_escape_sequences=object, _unicode=object, _bytes=object,
               partial=object, reduce=object, _IS_PY3=cython.bint)
 from io import StringIO
@@ -811,8 +811,8 @@ def p_cat_string_literal(s):
            if set([kind, next_kind]) in (set(['f', 'u']), set(['f', ''])):
                kind = 'f'
            else:
-                error(pos, "Cannot mix string literals of different types, expected %s'', got %s''" %
+                error(pos, "Cannot mix string literals of different types, expected %s'', got %s''" % (
-                      (kind, next_kind))
+                    kind, next_kind))
                continue
        bstrings.append(next_bytes_value)
        ustrings.append(next_unicode_value)
@@ -867,18 +867,17 @@ def p_string_literal(s, kind_override=None):
    # s.sy == 'BEGIN_STRING'
    pos = s.position()
-    is_raw = False
    is_python3_source = s.context.language_level >= 3
-    has_non_ascii_literal_characters = False
    kind_string = s.systring.rstrip('"\'').lower()
-    if len(set(kind_string)) != len(kind_string):
+    if len(kind_string) > 1:
-        s.error('Duplicate string prefix character')
+        if len(set(kind_string)) != len(kind_string):
-    if 'b' in kind_string and 'u' in kind_string:
+            error(pos, 'Duplicate string prefix character')
-        s.error('String prefixes b and u cannot be combined')
+        if 'b' in kind_string and 'u' in kind_string:
-    if 'b' in kind_string and 'f' in kind_string:
+            error(pos, 'String prefixes b and u cannot be combined')
-        s.error('String prefixes b and f cannot be combined')
+        if 'b' in kind_string and 'f' in kind_string:
-    if 'u' in kind_string and 'f' in kind_string:
+            error(pos, 'String prefixes b and f cannot be combined')
-        s.error('String prefixes u and f cannot be combined')
+        if 'u' in kind_string and 'f' in kind_string:
+            error(pos, 'String prefixes u and f cannot be combined')
    is_raw = 'r' in kind_string
@@ -886,10 +885,11 @@ def p_string_literal(s, kind_override=None):
        # this should never happen, since the lexer does not allow combining c
        # with other prefix characters
        if len(kind_string) != 1:
-            s.error('Invalid string prefix for character literal')
+            error(pos, 'Invalid string prefix for character literal')
        kind = 'c'
    elif 'f' in kind_string:
-        kind = 'f'  # u is ignored
+        kind = 'f'     # u is ignored
+        is_raw = True  # postpone the escape resolution
    elif 'b' in kind_string:
        kind = 'b'
    elif 'u' in kind_string:
@@ -917,54 +917,11 @@ def p_string_literal(s, kind_override=None):
        # print "p_string_literal: sy =", sy, repr(s.systring) ###
        if sy == 'CHARS':
            chars.append(systr)
-            if is_python3_source and not has_non_ascii_literal_characters and check_for_non_ascii_characters(systr):
-                has_non_ascii_literal_characters = True
        elif sy == 'ESCAPE':
            if is_raw:
                chars.append(systr)
-                if is_python3_source and not has_non_ascii_literal_characters \
-                       and check_for_non_ascii_characters(systr):
-                    has_non_ascii_literal_characters = True
            else:
-                c = systr[1]
+                _append_escape_sequence(kind, chars, systr, s)
-                if c in u"01234567":
-                    chars.append_charval( int(systr[1:], 8) )
-                elif c in u"'\"\\":
-                    chars.append(c)
-                elif c in u"abfnrtv":
-                    chars.append(
-                        StringEncoding.char_from_escape_sequence(systr))
-                elif c == u'\n':
-                    pass
-                elif c == u'x':   # \xXX
-                    if len(systr) == 4:
-                        chars.append_charval( int(systr[2:], 16) )
-                    else:
-                        s.error("Invalid hex escape '%s'" % systr,
-                                fatal=False)
-                elif c in u'NUu' and kind in ('u', 'f', ''):   # \uxxxx, \Uxxxxxxxx, \N{...}
-                    chrval = -1
-                    if c == u'N':
-                        try:
-                            chrval = ord(lookup_unicodechar(systr[3:-1]))
-                        except KeyError:
-                            s.error("Unknown Unicode character name %s" %
-                                    repr(systr[3:-1]).lstrip('u'))
-                    elif len(systr) in (6,10):
-                        chrval = int(systr[2:], 16)
-                        if chrval > 1114111: # sys.maxunicode:
-                            s.error("Invalid unicode escape '%s'" % systr)
-                            chrval = -1
-                    else:
-                        s.error("Invalid unicode escape '%s'" % systr,
-                                fatal=False)
-                    if chrval >= 0:
-                        chars.append_uescape(chrval, systr)
-                else:
-                    chars.append(u'\\' + systr[1:])
-                    if is_python3_source and not has_non_ascii_literal_characters \
-                           and check_for_non_ascii_characters(systr):
-                        has_non_ascii_literal_characters = True
        elif sy == 'NEWLINE':
            chars.append(u'\n')
        elif sy == 'END_STRING':
@@ -972,8 +929,8 @@ def p_string_literal(s, kind_override=None):
        elif sy == 'EOF':
            s.error("Unclosed string literal", pos=pos)
        else:
-            s.error("Unexpected token %r:%r in string literal" %
+            s.error("Unexpected token %r:%r in string literal" % (
-                    (sy, s.systring))
+                sy, s.systring))
    if kind == 'c':
        unicode_value = None
@@ -982,50 +939,119 @@ def p_string_literal(s, kind_override=None):
            error(pos, u"invalid character literal: %r" % bytes_value)
    else:
        bytes_value, unicode_value = chars.getstrings()
-        if is_python3_source and has_non_ascii_literal_characters:
+        if is_python3_source and check_for_non_ascii_characters(unicode_value):
            # Python 3 forbids literal non-ASCII characters in byte strings
            if kind not in ('u', 'f'):
-                s.error("bytes can only contain ASCII literal characters.",
+                error(pos, "bytes can only contain ASCII literal characters.")
-                        pos=pos, fatal=False)
            bytes_value = None
    if kind == 'f':
-        unicode_value = p_f_string(s, unicode_value, pos)
+        unicode_value = p_f_string(s, unicode_value, pos, is_raw='r' in kind_string)
    s.next()
    return (kind, bytes_value, unicode_value)
-def p_f_string(s, unicode_value, pos):
+def _append_escape_sequence(kind, builder, escape_sequence, s):
+    c = escape_sequence[1]
+    if c in u"01234567":
+        builder.append_charval(int(escape_sequence[1:], 8))
+    elif c in u"'\"\\":
+        builder.append(c)
+    elif c in u"abfnrtv":
+        builder.append(StringEncoding.char_from_escape_sequence(escape_sequence))
+    elif c == u'\n':
+        pass  # line continuation
+    elif c == u'x':  # \xXX
+        if len(escape_sequence) == 4:
+            builder.append_charval(int(escape_sequence[2:], 16))
+        else:
+            s.error("Invalid hex escape '%s'" % escape_sequence, fatal=False)
+    elif c in u'NUu' and kind in ('u', 'f', ''):  # \uxxxx, \Uxxxxxxxx, \N{...}
+        chrval = -1
+        if c == u'N':
+            try:
+                chrval = ord(lookup_unicodechar(escape_sequence[3:-1]))
+            except KeyError:
+                s.error("Unknown Unicode character name %s" %
+                        repr(escape_sequence[3:-1]).lstrip('u'), fatal=False)
+        elif len(escape_sequence) in (6, 10):
+            chrval = int(escape_sequence[2:], 16)
+            if chrval > 1114111:  # sys.maxunicode:
+                s.error("Invalid unicode escape '%s'" % escape_sequence)
+                chrval = -1
+        else:
+            s.error("Invalid unicode escape '%s'" % escape_sequence, fatal=False)
+        if chrval >= 0:
+            builder.append_uescape(chrval, escape_sequence)
+    else:
+        builder.append(escape_sequence)
+_parse_escape_sequences = re.compile(
+    # escape sequences:
+    br'(\\(?:'
+    br'[\\abfnrtv"\'{]|'
+    br'[0-7]{2,3}|'
+    br'N\{[^}]*\}|'
+    br'x[0-9a-fA-F]{2}|'
+    br'u[0-9a-fA-F]{4}|'
+    br'U[0-9a-fA-F]{8}|'
+    br'[NuU]|'  # detect invalid escape sequences that do not match above
+    br')?|'
+    # non-escape sequences:
+    br'\{\{?|'
+    br'\}\}?|'
+    br'[^\\{}]+)'.decode('us-ascii')
+).match
+def p_f_string(s, unicode_value, pos, is_raw):
    # Parses a PEP 498 f-string literal into a list of nodes. Nodes are either UnicodeNodes
    # or FormattedValueNodes.
    values = []
-    i = 0
+    next_start = 0
    size = len(unicode_value)
-    current_literal_start = 0
+    builder = StringEncoding.UnicodeLiteralBuilder()
-    while i < size:
-        c = unicode_value[i]
+    while next_start < size:
-        if c in '{}':
+        end = next_start
-            if i + 1 < size and unicode_value[i + 1] == c:
+        match = _parse_escape_sequences(unicode_value, next_start)
-                encoded_str = EncodedString(unicode_value[current_literal_start:i + 1])
+        if match is None:
-                values.append(ExprNodes.UnicodeNode(pos, value=encoded_str))
+            error_pos = (pos[0], pos[1] + end, pos[2])  # FIXME: handle newlines in string
-                i += 2
+            error(error_pos, "Invalid escape sequence")
-                current_literal_start = i
-            elif c == '}':
+        next_start = match.end()
-                s.error("single '}' encountered in format string")
+        part = match.group()
+        c = part[0]
+        if c == '\\':
+            if not is_raw and len(part) > 1:
+                _append_escape_sequence('f', builder, part, s)
            else:
-                encoded_str = EncodedString(unicode_value[current_literal_start:i])
+                builder.append(part)
-                values.append(ExprNodes.UnicodeNode(pos, value=encoded_str))
+        elif c == '{':
-                i, expr_node = p_f_string_expr(s, unicode_value, pos, i + 1)
+            if part == '{{':
-                current_literal_start = i
+                builder.append('{')
+            else:
+                # start of an expression
+                if builder.chars:
+                    values.append(ExprNodes.UnicodeNode(pos, value=builder.getstring()))
+                    builder = StringEncoding.UnicodeLiteralBuilder()
+                next_start, expr_node = p_f_string_expr(s, unicode_value, pos, next_start, is_raw)
                values.append(expr_node)
+        elif c == '}':
+            if part == '}}':
+                builder.append('}')
+            else:
+                error_pos = (pos[0], pos[1] + end, pos[2])  # FIXME: handle newlines in string
+                s.error("f-string: single '}' is not allowed", pos=error_pos)
        else:
-            i += 1
+            builder.append(part)
-    encoded_str = EncodedString(unicode_value[current_literal_start:])
+    if builder.chars:
-    values.append(ExprNodes.UnicodeNode(pos, value=encoded_str))
+        values.append(ExprNodes.UnicodeNode(pos, value=builder.getstring()))
    return values
-def p_f_string_expr(s, unicode_value, pos, starting_index):
+def p_f_string_expr(s, unicode_value, pos, starting_index, is_raw):
    # Parses a {}-delimited expression inside an f-string. Returns a FormattedValueNode
    # and the index in the string that follows the expression.
    i = starting_index
@@ -1045,7 +1071,8 @@ def p_f_string_expr(s, unicode_value, pos, starting_index):
        if quote_char != NO_CHAR:
            if c == '\\':
-                i += 1
+                error_pos = (pos[0], pos[1] + i, pos[2])  # FIXME: handle newlines in string
+                error(error_pos, "backslashes not allowed in f-strings")
            elif c == quote_char:
                if in_triple_quotes:
                    if i + 2 < size and unicode_value[i + 1] == c and unicode_value[i + 2] == c:
@@ -1080,15 +1107,16 @@ def p_f_string_expr(s, unicode_value, pos, starting_index):
    expr_pos = (pos[0], pos[1], pos[2] + starting_index + 2)  # TODO: find exact code position (concat, multi-line, ...)
    if not expr_str.strip():
-        s.error("empty expression not allowed in f-string")
+        error(pos, "empty expression not allowed in f-string")
    if terminal_char == '!':
        i += 1
        if i + 2 > size:
-            s.error("invalid conversion char at end of string")
+            error(pos, "invalid conversion char at end of string")
-        conversion_char = unicode_value[i]
+        else:
-        i += 1
+            conversion_char = unicode_value[i]
-        terminal_char = unicode_value[i]
+            i += 1
+            terminal_char = unicode_value[i]
    if terminal_char == ':':
        in_triple_quotes = False
@@ -1128,14 +1156,14 @@ def p_f_string_expr(s, unicode_value, pos, starting_index):
    # validate the conversion char
    if conversion_char is not None and not ExprNodes.FormattedValueNode.find_conversion_func(conversion_char):
-        s.error("invalid conversion character '%s'" % conversion_char)
+        error(pos, "invalid conversion character '%s'" % conversion_char)
    # the format spec is itself treated like an f-string
    if format_spec_str:
-        format_spec = ExprNodes.JoinedStrNode(pos, values=p_f_string(s, format_spec_str, pos))
+        format_spec = ExprNodes.JoinedStrNode(pos, values=p_f_string(s, format_spec_str, pos, is_raw))
    return i + 1, ExprNodes.FormattedValueNode(
-        s.position(), value=expr, conversion_char=conversion_char, format_spec=format_spec)
+        pos, value=expr, conversion_char=conversion_char, format_spec=format_spec)
 # since PEP 448:

--- a/tests/run/fstring.pyx
+++ b/tests/run/fstring.pyx
@@ -18,6 +18,15 @@ max_long = LONG_MAX
 min_long = LONG_MIN
+def escaping():
+    """
+    >>> escaping()
+    """
+    assert f'{{{{{"abc"}}}}}{{}}{{' == '{{abc}}{}{'
+    assert f'\x7b}}' == '{}'
+    assert f'{"{{}}"}' == '{{}}'
 def format2(ab, cd):
    """
    >>> a, b, c = format2(1, 2)