Commit ed6b7895 authored by Kirill Smelkov's avatar Kirill Smelkov

strconv += unquote(), unquote_next()

These are functions to decode quotation that was produced by
strconv.quote().
parent f09701b0
...@@ -135,7 +135,7 @@ without escaping printable UTF-8 characters:: ...@@ -135,7 +135,7 @@ without escaping printable UTF-8 characters::
and also any other type that can be converted to `str`. and also any other type that can be converted to `str`.
Package `golang.strconv` provides direct access to conversion routines, for Package `golang.strconv` provides direct access to conversion routines, for
example `strconv.quote`. example `strconv.quote` and `strconv.unquote`.
Benchmarking and testing Benchmarking and testing
......
...@@ -102,6 +102,93 @@ def _quote(s): ...@@ -102,6 +102,93 @@ def _quote(s):
return b'"' + b''.join(outv) + b'"' return b'"' + b''.join(outv) + b'"'
# unquote decodes a unicode|byte string that was produced by quote.
#
# The whole input must be exactly one quoted string: anything left after the
# closing " is rejected.
#
# ValueError is raised if there are quoting syntax errors, or if there is
# non-empty data after the closing ".
def unquote(s):
    unquoted, rest = unquote_next(s)
    if rest:
        raise ValueError('non-empty tail after closing "')
    return unquoted
# unquote_next decodes the next unicode|byte string that was produced by quote.
#
# it returns -> (unquoted(s), tail-after-")
#
# The decoding itself is done on bytes by _unquote_next; if _bstr reports that
# the input was unicode, both returned items are decoded back from UTF-8.
#
# ValueError is raised if there are quoting syntax errors.
def unquote_next(s):
    bs, wasunicode = _bstr(s)
    unquoted, rest = _unquote_next(bs)
    if not wasunicode:
        # byte input -> byte output, returned as is
        return unquoted, rest
    return unquoted.decode('UTF-8'), rest.decode('UTF-8')
# _unquote_next is the bytes-only worker behind unquote_next.
#
# s must be bytes and must start with ". Input is consumed up to and including
# the matching closing ", with the following escapes decoded on the way:
#
#   \\ \"       -> \ "
#   \t \n \r    -> TAB LF CR
#   \xHH        -> byte with hex value HH
#
# Everything else is copied to the output unchanged.
#
# it returns -> (unquoted bytes, bytes remaining after closing ")
#
# ValueError is raised if there are quoting syntax errors: no starting or
# closing ", input ending right after \ or \x, non-hex digits after \x, or an
# unknown escape character.
def _unquote_next(s):
    assert isinstance(s, bytes)

    if len(s) == 0 or s[0:0+1] != b'"':
        raise ValueError('no starting "')

    # \<c> -> replacement for escapes that are exactly one character long
    simple_escapes = {
        b'\\':  b'\\',
        b'"':   b'"',
        b't':   b'\t',
        b'n':   b'\n',
        b'r':   b'\r',
    }

    outv = []
    emit = outv.append

    s = s[1:]   # skip starting "
    while 1:
        # NOTE(review): _utf8_decode_rune is defined elsewhere in this file;
        # it is used here as returning (rune, width-in-bytes), with width == 0
        # once s is exhausted - confirm against its definition.
        r, width = _utf8_decode_rune(s)
        if width == 0:
            raise ValueError('no closing "')

        if r == u'"':
            s = s[1:]
            break

        # regular UTF-8 character
        if r != u'\\':
            emit(s[:width])
            s = s[width:]
            continue

        if len(s) < 2:
            raise ValueError('unexpected EOL after \\')

        # c is always exactly one byte here, kept as bytes on both py2 and py3
        c = s[1:1+1]

        unescaped = simple_escapes.get(c)
        if unescaped is not None:
            emit(unescaped)
            s = s[2:]
            continue

        if c == b'x':   # hex   XXX also handle octals?
            if len(s) < 2+2:
                raise ValueError('unexpected EOL after \\x')

            # the hex codec reports bad digits as TypeError on py2 and as
            # binascii.Error on py3; turn both into the ValueError that
            # callers are documented to expect for syntax errors.
            try:
                b = codecs.decode(s[2:2+2], 'hex')
            except (TypeError, ValueError):
                raise ValueError('invalid hex digits after \\x')
            emit(b)
            s = s[2+2:]
            continue

        raise ValueError('invalid escape \\%s' % chr(ord(c[0:0+1])))

    return b''.join(outv), s
_printable_cat0 = frozenset(['L', 'N', 'P', 'S']) # letters, numbers, punctuation, symbols _printable_cat0 = frozenset(['L', 'N', 'P', 'S']) # letters, numbers, punctuation, symbols
_rune_error = u'\uFFFD' # unicode replacement character _rune_error = u'\uFFFD' # unicode replacement character
......
...@@ -18,11 +18,12 @@ ...@@ -18,11 +18,12 @@
# See COPYING file for full licensing terms. # See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options. # See https://www.nexedi.com/licensing for rationale and options.
from golang.strconv import quote from golang.strconv import quote, unquote, unquote_next
from golang.gcompat import qq from golang.gcompat import qq
from six import int2byte as bchr, PY3 from six import int2byte as bchr, PY3
from six.moves import range as xrange from six.moves import range as xrange
from pytest import raises
def byterange(start, stop): def byterange(start, stop):
b = b"" b = b""
...@@ -62,10 +63,16 @@ def test_quote(): ...@@ -62,10 +63,16 @@ def test_quote():
for tin, tquoted in testv: for tin, tquoted in testv:
# quote(in) == quoted # quote(in) == quoted
# in = unquote(quoted)
q = b'"' if isinstance(tquoted, bytes) else '"' q = b'"' if isinstance(tquoted, bytes) else '"'
tail = b'123' if isinstance(tquoted, bytes) else '123'
tquoted = q + tquoted + q # add lead/trail " tquoted = q + tquoted + q # add lead/trail "
assert quote(tin) == tquoted assert quote(tin) == tquoted
assert unquote(tquoted) == tin
assert unquote_next(tquoted) == (tin, type(tin)())
assert unquote_next(tquoted + tail) == (tin, tail)
raises(ValueError, 'unquote(tquoted + tail)')
# qq always gives str # qq always gives str
assert qq(tin) == asstr(tquoted) assert qq(tin) == asstr(tquoted)
...@@ -78,12 +85,35 @@ def test_quote(): ...@@ -78,12 +85,35 @@ def test_quote():
# some inputs are not valid UTF-8 # some inputs are not valid UTF-8
continue continue
tquoted = tquoted.decode('utf-8') tquoted = tquoted.decode('utf-8')
tail = tail.decode('utf-8')
else: else:
# tin was unicode # tin was unicode
tin = tin.encode('utf-8') tin = tin.encode('utf-8')
tquoted = tquoted.encode('utf-8') tquoted = tquoted.encode('utf-8')
tail = tail.encode('utf-8')
assert quote(tin) == tquoted assert quote(tin) == tquoted
assert unquote(tquoted) == tin
assert unquote_next(tquoted) == (tin, type(tin)())
assert unquote_next(tquoted + tail) == (tin, tail)
raises(ValueError, 'unquote(tquoted + tail)')
# qq always gives str # qq always gives str
assert qq(tin) == asstr(tquoted) assert qq(tin) == asstr(tquoted)
# test_unquote_bad verifies that unquote rejects malformed input with
# ValueError carrying exactly the expected message.
def test_unquote_bad():
    # (malformed quoted input, expected ValueError message)
    bad_cases = [
        ('x"zzz"',  'no starting "'),
        ('"zzz',    'no closing "'),
        ('"\\',     'unexpected EOL after \\'),
        ('"\\x',    'unexpected EOL after \\x'),
        ('"\\x0',   'unexpected EOL after \\x'),
        ('"\\z"',   'invalid escape \\z'),
    ]

    for quoted, errmsg in bad_cases:
        with raises(ValueError) as excinfo:
            unquote(quoted)
        # the message must be the only exception argument
        assert excinfo.value.args == (errmsg,)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment