Commit 02dddb97 authored by Kirill Smelkov's avatar Kirill Smelkov

gcompat: Teach qq to accept both str and unicode + emit printable UTF-8 as is

This patch started out as a way to teach qq to also work on
python3. However, the differences in str / unicode and in escapes between
py2 / py3 quickly popped up, and it became easier to just handle
the whole escaping logic myself.

The implementation is based on

	kirr/go123@c0bbd06e

and a byproduct of the manual handling is that we no longer escape printable
UTF-8 characters.
parent 812e7ed7
......@@ -19,26 +19,109 @@
# See https://www.nexedi.com/licensing for rationale and options.
"""Package gcompat provides Go-compatibility layer for Python"""
import six, unicodedata
from six.moves import range as xrange
# qq is a stand-in for Go's %q verb, which python lacks: it returns obj
# quoted as a "..." string, always using " as the quote character.
#
# (python's own repr picks ' or " depending on the content.)
#
# Like %q, anything that is not already a string is first converted with
# str(). Text input gives a text result. Bytes input gives a text result on
# py3 and a bytes result on py2.
def qq(obj):
    is_text = isinstance(obj, six.text_type)    # py2: unicode  py3: str
    is_bytes = isinstance(obj, bytes)           # py2: str      py3: bytes

    # go: like %s, %q automatically converts to string
    if not (is_bytes or is_text):
        obj = str(obj)
        is_text = isinstance(obj, six.text_type)

    # _quote works on bytes only, so text makes a round trip through UTF-8;
    # on py3 the quoted form of bytes input is returned as text as well.
    want_text = is_text or (is_bytes and six.PY3)
    raw = obj.encode('UTF-8') if is_text else obj

    quoted = _quote(raw)
    return quoted.decode('UTF-8') if want_text else quoted
# _quote quotes bytes string into valid "..." bytes string always quoted with ".
def _quote(s):
# TODO also accept unicode as input.
# TODO output printable UTF-8 characters as-is, but escape non-printable UTF-8 and invalid UTF-8 bytes.
outv = []
# we don't want ' to be escaped
for _ in s.split("'"):
# this escape almost everything except " character
# NOTE string_escape does not do smartquotes and always uses ' for quoting
# (repr(str) is the same except it does smartquoting picking ' or " automatically)
q = _.encode("string_escape")
q = q.replace('"', r'\"')
outv.append(q)
return '"' + "'".join(outv) + '"'
emit = outv.append
i = 0
while i < len(s):
c = s[i:i+1]
# fast path - ASCII only
if ord(c) < 0x80:
if c in b'\\"':
emit(b'\\'+c)
# printable ASCII
elif b' ' <= c <= b'\x7e':
emit(c)
# non-printable ASCII
elif c == b'\t':
emit(br'\t')
elif c == b'\n':
emit(br'\n')
elif c == b'\r':
emit(br'\r')
# everything else is non-printable
else:
emit(br'\x%02x' % ord(c))
i += 1
# slow path - full UTF-8 decoding + unicodedata
else:
r, size = _utf8_decode_rune(s[i:])
isize = i + size
# decode error - just emit raw byte as escaped
if r == _rune_error:
emit(br'\x%02x' % ord(c))
# printable utf-8 characters go as is
elif unicodedata.category(r)[0] in _printable_cat0:
emit(s[i:isize])
# everything else goes in numeric byte escapes
else:
for j in xrange(i, isize):
emit(br'\x%02x' % ord(s[j:j+1]))
i = isize
return b'"' + b''.join(outv) + b'"'
_printable_cat0 = frozenset(['L', 'N', 'P', 'S']) # letters, numbers, punctuation, symbols
_rune_error = u'\uFFFD' # unicode replacement character
# _utf8_decode_rune decodes next UTF8-character from byte string s.
#
# _utf8_decode_rune(s) -> (r, size)
def _utf8_decode_rune(s):
if len(s) == 0:
return '', 0
l = min(len(s), 4) # max size of an UTF-8 encoded character
while l > 0:
try:
r = s[:l].decode('utf-8', 'strict')
except UnicodeDecodeError:
l -= 1
continue
if len(r) == 1:
return r, l
l -= 1
continue
# invalid UTF-8
return _rune_error, 1
......@@ -19,17 +19,38 @@
# See https://www.nexedi.com/licensing for rationale and options.
from golang.gcompat import qq
from six import int2byte as bchr
from six.moves import range as xrange
# byterange returns bytes made of every byte value in [start, stop), in
# ascending order; the result is empty when start >= stop.
def byterange(start, stop):
    # build the result in one go instead of growing it with += per byte;
    # bytearray accepts an iterable of ints on both py2 and py3.
    return bytes(bytearray(range(start, stop)))
def test_qq():
testv = (
# in want without leading/trailing "
('', r""),
(byterange(0,32), r'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'),
('\'', r"'"),
('"', r"\""),
('abc\ndef', r"abc\ndef"),
('ab c\ndef', r"ab c\ndef"),
('a\'c\ndef', r"a'c\ndef"),
('a\"c\ndef', r"a\"c\ndef"),
# ('привет', r"привет"), TODO
(u'a\"c\ndef', u"a\\\"c\\ndef"),
(b'a\"c\ndef', r'a\"c\ndef'),
('привет\nмир', r"привет\nмир"),
(u'привет\nмир', u"привет\\nмир"),
# invalid utf-8
(b"\xd0a", r"\xd0a"),
# non-printable utf-8
(u"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087", u"\\x7f\\xc2\\x80\\xc2\\x81\\xc2\\x82\\xc2\\x83\\xc2\\x84\\xc2\\x85\\xc2\\x86\\xc2\\x87"),
)
for tin, twant in testv:
......
......@@ -20,6 +20,8 @@ setup(
packages = find_packages(),
install_requires = ['six'],
extras_require = {
'test': ['pytest'],
},
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment