Commit 78b4b41c authored by Kirill Smelkov

strconv: Fix unquote to handle \a \b \v \f

Those are escape sequences that Go's strconv.Quote might produce. And even
though Python does not emit them when quoting, it still accepts those escape
sequences when decoding:

    In [1]: '\r'
    Out[1]: '\r'

    In [2]: '\a\b\v\f'
    Out[2]: '\x07\x08\x0b\x0c'

https://github.com/python/cpython/blob/2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L677-L688

-> Teach strconv.unquote + friends to handle them as well.

/reviewed-by @jerome
/reviewed-on !14
parent 4f28dddf
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2018-2020 Nexedi SA and Contributors. # Copyright (C) 2018-2021 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
# This program is free software: you can Use, Study, Modify and Redistribute # This program is free software: you can Use, Study, Modify and Redistribute
...@@ -134,7 +134,7 @@ def _quote(s): ...@@ -134,7 +134,7 @@ def _quote(s):
return b'"' + b''.join(outv) + b'"' return b'"' + b''.join(outv) + b'"'
# unquote decodes unicode|byte string that was produced by quote. # unquote decodes "-quoted unicode|byte string.
# #
# ValueError is raised if there are quoting syntax errors. # ValueError is raised if there are quoting syntax errors.
def unquote(s): def unquote(s):
...@@ -143,7 +143,7 @@ def unquote(s): ...@@ -143,7 +143,7 @@ def unquote(s):
raise ValueError('non-empty tail after closing "') raise ValueError('non-empty tail after closing "')
return us return us
# unquote_next decodes next unicode|byte string that was produced by quote. # unquote_next decodes next "-quoted unicode|byte string.
# #
# it returns -> (unquoted(s), tail-after-") # it returns -> (unquoted(s), tail-after-")
# #
...@@ -192,22 +192,26 @@ def _unquote_next(s): ...@@ -192,22 +192,26 @@ def _unquote_next(s):
s = s[2:] s = s[2:]
continue continue
if c == b't': # \t \n \r
emit(b'\t') uc = None
if c == b't': uc = b'\t'
elif c == b'n': uc = b'\n'
elif c == b'r': uc = b'\r'
# accept also \a \b \v \f that Go might produce
# Python also decodes those escapes even though it does not produce them:
# https://github.com/python/cpython/blob/2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L677-L688
elif c == b'a': uc = b'\x07'
elif c == b'b': uc = b'\x08'
elif c == b'v': uc = b'\x0b'
elif c == b'f': uc = b'\x0c'
if uc is not None:
emit(uc)
s = s[2:] s = s[2:]
continue continue
if c == b'n': # \x?? hex
emit(b'\n') if c == b'x': # XXX also handle octals?
s = s[2:]
continue
if c == b'r':
emit(b'\r')
s = s[2:]
continue
if c == b'x': # hex XXX also handle octals?
if len(s) < 2+2: if len(s) < 2+2:
raise ValueError('unexpected EOL after \\x') raise ValueError('unexpected EOL after \\x')
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2018-2019 Nexedi SA and Contributors. # Copyright (C) 2018-2021 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
# This program is free software: you can Use, Study, Modify and Redistribute # This program is free software: you can Use, Study, Modify and Redistribute
...@@ -110,6 +110,21 @@ def test_quote(): ...@@ -110,6 +110,21 @@ def test_quote():
assert qq(tin) == asstr(tquoted) assert qq(tin) == asstr(tquoted)
# unquote must also accept escapes that are outside of the canonical quoted form.
def test_unquote_noncanon():
    # each entry pairs an escape sequence (given without the surrounding
    # double quotes) with the single character it has to decode to.
    cases = (
        # escape    decoded
        (r'\a',     "\x07"),
        (r'\b',     "\x08"),
        (r'\v',     "\x0b"),
        (r'\f',     "\x0c"),
    )
    for escaped, expected in cases:
        # wrap the escape into "..." so that it forms a complete quoted string
        quoted = '"' + escaped + '"'
        assert unquote(quoted) == expected
def test_unquote_bad(): def test_unquote_bad():
testv = ( testv = (
# in error # in error
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment