Commit e4d5cb21 authored by Kirill Smelkov's avatar Kirill Smelkov

golang_str: Treat bytearray also as bytestring, just mutable

bytearray was introduced in Python as a mutable version of bytes. It has
all string methods (e.g. .capitalize(), .islower(), etc.), and it also
supports % formatting. In other words it has all attributes of being a
byte-string, with the only difference from bytes being that bytearray is
mutable. This makes bytearray handy when a string is
being incrementally constructed step by step, without the overhead of
creating and destroying many bytes objects.

So, since bytearray is also a bytestring, similarly to bytes, let's add
support to interoperate with bytearray to bstr and ustr:

- b/u and bstr/ustr now accept bytearray as argument and treat it as a bytestring.
- bytearray() constructor, similarly to bytes() and unicode()
  constructors, now also accepts bstr/ustr and creates a bytearray object
  corresponding to the byte-stream of the input.

For the latter point to work we need to patch bytearray.__init__() a bit,
since, contrary to bytes.__init__(), it does not pay attention to
whether provided argument has __bytes__ method or not.
parent 781802d4
......@@ -240,16 +240,16 @@ The conversion, in both encoding and decoding, never fails and never loses
information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity
even if bytes data is not valid UTF-8.
Operations in between `bstr` and `ustr`/`unicode` / `bytes` coerce to `bstr`, while
operations in between `ustr` and `bstr`/`bytes` / `unicode` coerce
to `ustr`. When the coercion happens, `bytes`, similarly to
Operations in between `bstr` and `ustr`/`unicode` / `bytes`/`bytearray` coerce to `bstr`, while
operations in between `ustr` and `bstr`/`bytes`/`bytearray` / `unicode` coerce
to `ustr`. When the coercion happens, `bytes` and `bytearray`, similarly to
`bstr`, are also treated as UTF8-encoded strings.
`bstr` and `ustr` are meant to be drop-in replacements for standard
`str`/`unicode` classes. They support all methods of `str`/`unicode` and in
particular their constructors accept arbitrary objects and either convert or stringify them. For
cases when no stringification is desired, and one only wants to convert
`bstr`/`ustr` / `unicode`/`bytes`
`bstr`/`ustr` / `unicode`/`bytes`/`bytearray`
to Pygolang string, `b` and `u` provide way to make sure an
object is either `bstr` or `ustr` correspondingly.
......@@ -258,7 +258,7 @@ Usage example::
s = b('привет') # s is bstr corresponding to UTF-8 encoding of 'привет'.
def f(s):
s = u(s) # make sure s is ustr, decoding as UTF-8(*) if it was bstr or bytes.
s = u(s) # make sure s is ustr, decoding as UTF-8(*) if it was bstr, bytes or bytearray.
... # (*) the decoding never fails nor loses information.
.. [*] `unicode` on Python2, `str` on Python3.
......
......@@ -29,6 +29,11 @@ from cpython cimport Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE
cdef extern from "Python.h":
void PyType_Modified(PyTypeObject *)
# Expose the tp_init slot of the C-level PyTypeObject: the struct is declared
# under the Cython-side name _XPyTypeObject while mapping to C "PyTypeObject".
cdef extern from "Python.h":
# signature of a tp_init slot function; -1 is its error return value
ctypedef int (*initproc)(object, PyObject *, PyObject *) except -1
ctypedef struct _XPyTypeObject "PyTypeObject":
initproc tp_init
from libc.stdint cimport uint8_t
pystrconv = None # = golang.strconv imported at runtime (see __init__.py)
......@@ -39,7 +44,7 @@ def pyb(s): # -> bstr
"""b converts object to bstr.
- For bstr the same object is returned.
- For bytes the data is
- For bytes or bytearray the data is
preserved as-is and only result type is changed to bstr.
- For ustr/unicode the data is UTF-8 encoded. The encoding always succeeds.
......@@ -61,7 +66,7 @@ def pyu(s): # -> ustr
- For ustr the same object is returned.
- For unicode the data is preserved as-is and only result type is changed to ustr.
- For bstr or bytes the data is UTF-8 decoded.
- For bstr, bytes or bytearray the data is UTF-8 decoded.
The decoding always succeeds and input
information is not lost: non-valid UTF-8 bytes are decoded into
surrogate codes ranging from U+DC80 to U+DCFF.
......@@ -89,6 +94,9 @@ cdef _pyb(bcls, s): # -> ~bstr | None
s = _bdata(s)
elif isinstance(s, unicode):
s = _utf8_encode_surrogateescape(s)
else:
if isinstance(s, bytearray):
s = bytes(s)
else:
return None
......@@ -102,7 +110,10 @@ cdef _pyu(ucls, s): # -> ~ustr | None
if isinstance(s, unicode):
if type(s) is not unicode:
s = _udata(s)
elif isinstance(s, bytes):
else:
if isinstance(s, bytearray):
s = bytes(s)
if isinstance(s, bytes):
s = _utf8_decode_surrogateescape(s)
else:
return None
......@@ -115,7 +126,7 @@ cdef _pyu(ucls, s): # -> ~ustr | None
# _pyb_coerce prepares x to take part in an operation with bstr:
# a bytes instance is returned as is, unicode and bytearray are converted via
# pyb, and any other type is rejected with TypeError.
# NOTE(review): this is a diff view - the two consecutive `elif isinstance`
# lines are the old and the new version of the same branch.
cdef _pyb_coerce(x): # -> bstr|bytes
if isinstance(x, bytes):
return x
elif isinstance(x, unicode):
elif isinstance(x, (unicode, bytearray)):
return pyb(x)
else:
raise TypeError("b: coerce: invalid type %s" % type(x))
......@@ -124,7 +135,7 @@ cdef _pyb_coerce(x): # -> bstr|bytes
# _pyu_coerce prepares x to take part in an operation with ustr:
# a unicode instance is returned as is, bytes and bytearray are converted via
# pyu, and any other type is rejected with TypeError.
# NOTE(review): this is a diff view - the two consecutive `elif isinstance`
# lines are the old and the new version of the same branch.
cdef _pyu_coerce(x): # -> ustr|unicode
if isinstance(x, unicode):
return x
elif isinstance(x, bytes):
elif isinstance(x, (bytes, bytearray)):
return pyu(x)
else:
raise TypeError("u: coerce: invalid type %s" % type(x))
......@@ -160,8 +171,8 @@ class pybstr(bytes):
is always identity even if bytes data is not valid UTF-8.
Operations in between bstr and ustr/unicode / bytes coerce to bstr.
When the coercion happens, bytes, similarly to bstr, are also
Operations in between bstr and ustr/unicode / bytes/bytearray coerce to bstr.
When the coercion happens, bytes and bytearray, similarly to bstr, are also
treated as UTF8-encoded strings.
bstr constructor accepts arbitrary objects and stringify them:
......@@ -169,7 +180,7 @@ class pybstr(bytes):
- if encoding and/or errors is specified, the object must provide buffer
interface. The data in the buffer is decoded according to provided
encoding/errors and further encoded via UTF-8 into bstr.
- if the object is bstr/ustr / unicode/bytes - it is converted
- if the object is bstr/ustr / unicode/bytes/bytearray - it is converted
to bstr. See b for details.
- otherwise bstr will have string representation of the object.
......@@ -240,8 +251,8 @@ class pyustr(unicode):
is always identity even if bytes data is not valid UTF-8.
Operations in between ustr and bstr/bytes / unicode coerce to ustr.
When the coercion happens, bytes, similarly to bstr, are also
Operations in between ustr and bstr/bytes/bytearray / unicode coerce to ustr.
When the coercion happens, bytes and bytearray, similarly to bstr, are also
treated as UTF8-encoded strings.
ustr constructor, similarly to the one in bstr, accepts arbitrary objects
......@@ -379,6 +390,8 @@ def pyqq(obj):
cdef _bstringify(object obj): # -> unicode|bytes
if type(obj) in (pybstr, pyustr, bytes, unicode):
return obj
if type(obj) is bytearray:
return bytes(obj)
if PY_MAJOR_VERSION >= 3:
return unicode(obj)
......@@ -440,6 +453,39 @@ if PY_MAJOR_VERSION < 3:
_()
# patch:
#
# - bytearray.__init__ to accept ustr instead of raising 'TypeError:
# string argument without an encoding' (pybug: bytearray() should respect
# __bytes__ similarly to bytes)
# _bytearray_tp_init keeps the original tp_init of bytearray so that the
# replacement below can delegate to it.
cdef initproc _bytearray_tp_init = (<_XPyTypeObject*>bytearray) .tp_init
# _bytearray_tp_xinit is the replacement for bytearray's tp_init.
#
# When called with exactly one positional argument and no keyword arguments,
# and that argument is a ustr, the argument is first converted with pyb.
# The call is then forwarded to the original tp_init, whose result is returned.
cdef int _bytearray_tp_xinit(object self, PyObject* args, PyObject* kw) except -1:
# arguments are considered for rewriting only when kw is NULL or empty
if args != NULL and (kw == NULL or (not <object>kw)):
argv = <object>args
if isinstance(argv, tuple) and len(argv) == 1:
arg = argv[0]
if isinstance(arg, pyustr):
argv = (pyb(arg),) # NOTE argv is kept alive till end of function
args = <PyObject*>argv # no need to incref it
# delegate to the original initializer with the (possibly rewritten) args
return _bytearray_tp_init(self, args, kw)
# _bytearray_x__init__ is the Python-level wrapper around _bytearray_tp_xinit:
# it passes its positional and keyword arguments through as the args tuple
# and the kw dict.
def _bytearray_x__init__(self, *argv, **kw):
# NOTE don't return - just call: __init__ should return None
_bytearray_tp_xinit(self, <PyObject*>argv, <PyObject*>kw)
# _ installs the patch into bytearray and into the classes returned by
# bytearray.__subclasses__(): a type whose tp_init is still the original
# bytearray initializer gets _bytearray_tp_xinit put into that slot, and
# _bytearray_x__init__ is installed as '__init__' via _patch_slot.
# NOTE(review): indentation is lost in this view - the _patch_slot call
# presumably sits inside the `if` next to the slot assignment; confirm
# against the real file.
def _():
cdef PyTypeObject* t
for pyt in [bytearray] + bytearray.__subclasses__():
assert isinstance(pyt, type)
t = <PyTypeObject*>pyt
t_ = <_XPyTypeObject*>t
# only types that still use the original bytearray initializer are touched
if t_.tp_init == _bytearray_tp_init:
t_.tp_init = _bytearray_tp_xinit
_patch_slot(t, '__init__', _bytearray_x__init__)
# apply the patch when the module body is executed
_()
# _patch_slot installs func_or_descr into typ's __dict__ as name.
#
# if func_or_descr is descriptor (has __get__), it is installed as is.
......
......@@ -35,8 +35,11 @@ import array
# buftypes lists types with buffer interface that we will test against.
#
# NOTE bytearray is not included here - being bytes-like object it is handled
# and tested explicitly in tests that exercise interaction of bstr/ustr with
# bytes/unicode and bytearray.
buftypes = [
bytearray,
memoryview,
lambda x: array.array('B', x),
]
......@@ -125,7 +128,7 @@ def test_strings_basic():
assert ub_tunicode_ == tunicode
# b/u accept only ~bytes/~unicode
# b/u accept only ~bytes/~unicode/bytearray
with raises(TypeError): b()
with raises(TypeError): u()
with raises(TypeError): b(123)
......@@ -149,6 +152,7 @@ def test_strings_basic():
b_ = xbytes ("мир"); assert type(b_) is bytes
u_ = xunicode ("мир"); assert type(u_) is unicode
ba_ = xbytearray("мир"); assert type(ba_) is bytearray
# b/u from unicode
bs = b(u_); assert isinstance(bs, bytes); assert type(bs) is bstr
......@@ -162,11 +166,15 @@ def test_strings_basic():
_ = bstr(b_); assert type(_) is bstr; assert _ == "мир"
_ = ustr(b_); assert type(_) is ustr; assert _ == "мир"
# TODO also handle bytearray?
# b/u from bytearray
_ = b(ba_); assert type(_) is bstr; assert _ == "мир"
_ = u(ba_); assert type(_) is ustr; assert _ == "мир"
_ = bstr(ba_); assert type(_) is bstr; assert _ == "мир"
_ = ustr(ba_); assert type(_) is ustr; assert _ == "мир"
# bstr/ustr from bytes/buffer with encoding
# bstr/ustr from bytes/bytearray/buffer with encoding
k8mir_bytes = u"мир".encode('koi8-r')
for tbuf in [bytes] + buftypes:
for tbuf in [bytes, bytearray] + buftypes:
k8mir = tbuf(k8mir_bytes)
_ = bstr(k8mir, 'koi8-r'); assert type(_) is bstr; assert _ == "мир"
_ = ustr(k8mir, 'koi8-r'); assert type(_) is ustr; assert _ == "мир"
......@@ -174,10 +182,10 @@ def test_strings_basic():
with raises(UnicodeDecodeError): ustr(k8mir, 'ascii')
_ = bstr(k8mir, 'ascii', 'replace'); assert type(_) is bstr; assert _ == u'\ufffd\ufffd\ufffd'
_ = ustr(k8mir, 'ascii', 'replace'); assert type(_) is ustr; assert _ == u'\ufffd\ufffd\ufffd'
# no encoding -> utf8 with surrogateescape for bytes, stringify for the rest
# no encoding -> utf8 with surrogateescape for bytes/bytearray, stringify for the rest
k8mir_usurrogateescape = u'\udccd\udcc9\udcd2'
k8mir_strok = k8mir_usurrogateescape
if not tbuf in (bytes,):
if not tbuf in (bytes, bytearray):
k8mir_strok = str(k8mir) # e.g. '<memory at ...>' for memoryview
_ = bstr(k8mir); assert type(_) is bstr; assert _ == k8mir_strok
_ = ustr(k8mir); assert type(_) is ustr; assert _ == k8mir_strok
......@@ -202,6 +210,10 @@ def test_strings_basic():
_ = unicode(bs); assert type(_) is ustr; assert _ == "мир"
_ = bytes (us); assert type(_) is bstr; assert _ == "мир"
# bytearray(b|u) -> bytearray
_ = bytearray(bs); assert type(_) is bytearray; assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80'
_ = bytearray(us); assert type(_) is bytearray; assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80'
# b(u(·)), u(b(·))
_ = b(us); assert type(_) is bstr; assert _ == "мир"
_ = u(bs); assert type(_) is ustr; assert _ == "мир"
......@@ -226,14 +238,14 @@ def test_strings_basic():
# verify string operations like `x + y` for all combinations of pairs from
# bytes, unicode, bstr and ustr. Except if both x and y are std
# bytes, unicode, bstr, ustr and bytearray. Except if both x and y are std
# python types, e.g. (bytes, unicode), because those combinations are handled
# only by builtin python code and might be rejected.
@mark.parametrize('tx', (bytes, unicode, bstr, ustr))
@mark.parametrize('ty', (bytes, unicode, bstr, ustr))
@mark.parametrize('tx', (bytes, unicode, bstr, ustr, bytearray))
@mark.parametrize('ty', (bytes, unicode, bstr, ustr, bytearray))
def test_strings_ops2(tx, ty):
# skip e.g. regular bytes vs regular unicode
tstd = {bytes, unicode}
tstd = {bytes, unicode, bytearray}
if tx in tstd and ty in tstd and tx is not ty:
skip()
......@@ -383,6 +395,7 @@ def test_qq():
# verify that what we patched stays unaffected when
# called outside of bstr/ustr context.
def test_strings_patched_transparently():
b_ = xbytes ("мир"); assert type(b_) is bytes
u_ = xunicode ("мир"); assert type(u_) is unicode
# unicode comparison stay unaffected
......@@ -401,6 +414,25 @@ def test_strings_patched_transparently():
assert (u_ <= u2) is False ; assert (u2 <= u_) is True
assert (u_ >= u2) is True ; assert (u2 >= u_) is False
# bytearray.__init__ stays unaffected
with raises(TypeError): bytearray(u'мир')
a = bytearray()
with raises(TypeError): a.__init__(u'мир')
def _(*argv):
a = bytearray(*argv)
b = bytearray(); _ = b.__init__(*argv); assert _ is None
ra = repr(a)
rb = repr(b)
assert ra == rb
return ra
assert _() == r"bytearray(b'')"
assert _(b_) == r"bytearray(b'\xd0\xbc\xd0\xb8\xd1\x80')"
assert _(u_, 'utf-8') == r"bytearray(b'\xd0\xbc\xd0\xb8\xd1\x80')"
assert _(3) == r"bytearray(b'\x00\x00\x00')"
assert _((1,2,3)) == r"bytearray(b'\x01\x02\x03')"
# ---- benchmarks ----
......@@ -431,10 +463,11 @@ def bench_bencode(b):
# ---- misc ----
# xbytes/xunicode convert provided bytes/unicode object to bytes or
# unicode correspondingly to function name.
# xbytes/xunicode/xbytearray convert provided bytes/unicode object to bytes,
# unicode or bytearray correspondingly to function name.
def xbytes(x):
    # Only an object whose exact type is unicode is UTF-8 encoded;
    # everything else (bytes, subclasses, other objects) is returned untouched.
    if type(x) is not unicode:
        return x
    return x.encode('utf-8')
def xunicode(x):
    # Only an object whose exact type is bytes is UTF-8 decoded;
    # everything else is returned untouched.
    if type(x) is bytes:
        return x.decode('utf-8')
    return x
def xbytearray(x):
    # Normalize the input with xbytes first, then copy it into a fresh bytearray.
    raw = xbytes(x)
    return bytearray(raw)
# xstr returns string corresponding to specified type and data.
def xstr(text, typ):
......@@ -442,6 +475,7 @@ def xstr(text, typ):
t = {
bytes: xbytes,
unicode: xunicode,
bytearray: xbytearray,
bstr: b,
ustr: u,
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment