Commit e4d5cb21 authored by Kirill Smelkov's avatar Kirill Smelkov

golang_str: Treat bytearray also as bytestring, just mutable

bytearray was introduced in Python as a mutable version of bytes. It has
all string methods (e.g. .capitalize(), .islower(), etc.), and it also
supports % formatting. In other words, it has all the attributes of a
byte-string, the only difference from bytes being that bytearray is
mutable. This makes bytearray handy when a string is
being incrementally constructed step by step, without incurring the overhead
of creating and destroying many bytes objects.

So, since bytearray is also a bytestring, similarly to bytes, let's add
support for interoperating with bytearray to bstr and ustr:

- b/u and bstr/ustr now accept bytearray as an argument and treat it as a bytestring.
- The bytearray() constructor, similarly to the bytes() and unicode()
  constructors, now also accepts bstr/ustr and creates a bytearray object
  corresponding to the byte-stream of the input.

For the latter point to work we need to patch bytearray.__init__() a bit,
since, contrary to bytes.__init__(), it does not pay attention to
whether the provided argument has a __bytes__ method or not.
parent 781802d4
...@@ -240,16 +240,16 @@ The conversion, in both encoding and decoding, never fails and never looses ...@@ -240,16 +240,16 @@ The conversion, in both encoding and decoding, never fails and never looses
information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity
even if bytes data is not valid UTF-8. even if bytes data is not valid UTF-8.
Operations in between `bstr` and `ustr`/`unicode` / `bytes` coerce to `bstr`, while Operations in between `bstr` and `ustr`/`unicode` / `bytes`/`bytearray` coerce to `bstr`, while
operations in between `ustr` and `bstr`/`bytes` / `unicode` coerce operations in between `ustr` and `bstr`/`bytes`/`bytearray` / `unicode` coerce
to `ustr`. When the coercion happens, `bytes`, similarly to to `ustr`. When the coercion happens, `bytes` and `bytearray`, similarly to
`bstr`, are also treated as UTF8-encoded strings. `bstr`, are also treated as UTF8-encoded strings.
`bstr` and `ustr` are meant to be drop-in replacements for standard `bstr` and `ustr` are meant to be drop-in replacements for standard
`str`/`unicode` classes. They support all methods of `str`/`unicode` and in `str`/`unicode` classes. They support all methods of `str`/`unicode` and in
particular their constructors accept arbitrary objects and either convert or stringify them. For particular their constructors accept arbitrary objects and either convert or stringify them. For
cases when no stringification is desired, and one only wants to convert cases when no stringification is desired, and one only wants to convert
`bstr`/`ustr` / `unicode`/`bytes` `bstr`/`ustr` / `unicode`/`bytes`/`bytearray`
to Pygolang string, `b` and `u` provide way to make sure an to Pygolang string, `b` and `u` provide way to make sure an
object is either `bstr` or `ustr` correspondingly. object is either `bstr` or `ustr` correspondingly.
...@@ -258,7 +258,7 @@ Usage example:: ...@@ -258,7 +258,7 @@ Usage example::
s = b('привет') # s is bstr corresponding to UTF-8 encoding of 'привет'. s = b('привет') # s is bstr corresponding to UTF-8 encoding of 'привет'.
def f(s): def f(s):
s = u(s) # make sure s is ustr, decoding as UTF-8(*) if it was bstr or bytes. s = u(s) # make sure s is ustr, decoding as UTF-8(*) if it was bstr, bytes or bytearray.
... # (*) the decoding never fails nor looses information. ... # (*) the decoding never fails nor looses information.
.. [*] `unicode` on Python2, `str` on Python3. .. [*] `unicode` on Python2, `str` on Python3.
......
...@@ -29,6 +29,11 @@ from cpython cimport Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE ...@@ -29,6 +29,11 @@ from cpython cimport Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE
cdef extern from "Python.h": cdef extern from "Python.h":
void PyType_Modified(PyTypeObject *) void PyType_Modified(PyTypeObject *)
cdef extern from "Python.h":
ctypedef int (*initproc)(object, PyObject *, PyObject *) except -1
ctypedef struct _XPyTypeObject "PyTypeObject":
initproc tp_init
from libc.stdint cimport uint8_t from libc.stdint cimport uint8_t
pystrconv = None # = golang.strconv imported at runtime (see __init__.py) pystrconv = None # = golang.strconv imported at runtime (see __init__.py)
...@@ -39,7 +44,7 @@ def pyb(s): # -> bstr ...@@ -39,7 +44,7 @@ def pyb(s): # -> bstr
"""b converts object to bstr. """b converts object to bstr.
- For bstr the same object is returned. - For bstr the same object is returned.
- For bytes the data is - For bytes or bytearray the data is
preserved as-is and only result type is changed to bstr. preserved as-is and only result type is changed to bstr.
- For ustr/unicode the data is UTF-8 encoded. The encoding always succeeds. - For ustr/unicode the data is UTF-8 encoded. The encoding always succeeds.
...@@ -61,7 +66,7 @@ def pyu(s): # -> ustr ...@@ -61,7 +66,7 @@ def pyu(s): # -> ustr
- For ustr the same object is returned. - For ustr the same object is returned.
- For unicode the data is preserved as-is and only result type is changed to ustr. - For unicode the data is preserved as-is and only result type is changed to ustr.
- For bstr or bytes the data is UTF-8 decoded. - For bstr, bytes or bytearray the data is UTF-8 decoded.
The decoding always succeeds and input The decoding always succeeds and input
information is not lost: non-valid UTF-8 bytes are decoded into information is not lost: non-valid UTF-8 bytes are decoded into
surrogate codes ranging from U+DC80 to U+DCFF. surrogate codes ranging from U+DC80 to U+DCFF.
...@@ -90,7 +95,10 @@ cdef _pyb(bcls, s): # -> ~bstr | None ...@@ -90,7 +95,10 @@ cdef _pyb(bcls, s): # -> ~bstr | None
elif isinstance(s, unicode): elif isinstance(s, unicode):
s = _utf8_encode_surrogateescape(s) s = _utf8_encode_surrogateescape(s)
else: else:
return None if isinstance(s, bytearray):
s = bytes(s)
else:
return None
assert type(s) is bytes assert type(s) is bytes
return bytes.__new__(bcls, s) return bytes.__new__(bcls, s)
...@@ -102,10 +110,13 @@ cdef _pyu(ucls, s): # -> ~ustr | None ...@@ -102,10 +110,13 @@ cdef _pyu(ucls, s): # -> ~ustr | None
if isinstance(s, unicode): if isinstance(s, unicode):
if type(s) is not unicode: if type(s) is not unicode:
s = _udata(s) s = _udata(s)
elif isinstance(s, bytes):
s = _utf8_decode_surrogateescape(s)
else: else:
return None if isinstance(s, bytearray):
s = bytes(s)
if isinstance(s, bytes):
s = _utf8_decode_surrogateescape(s)
else:
return None
assert type(s) is unicode assert type(s) is unicode
return unicode.__new__(ucls, s) return unicode.__new__(ucls, s)
...@@ -115,7 +126,7 @@ cdef _pyu(ucls, s): # -> ~ustr | None ...@@ -115,7 +126,7 @@ cdef _pyu(ucls, s): # -> ~ustr | None
cdef _pyb_coerce(x): # -> bstr|bytes cdef _pyb_coerce(x): # -> bstr|bytes
if isinstance(x, bytes): if isinstance(x, bytes):
return x return x
elif isinstance(x, unicode): elif isinstance(x, (unicode, bytearray)):
return pyb(x) return pyb(x)
else: else:
raise TypeError("b: coerce: invalid type %s" % type(x)) raise TypeError("b: coerce: invalid type %s" % type(x))
...@@ -124,7 +135,7 @@ cdef _pyb_coerce(x): # -> bstr|bytes ...@@ -124,7 +135,7 @@ cdef _pyb_coerce(x): # -> bstr|bytes
cdef _pyu_coerce(x): # -> ustr|unicode cdef _pyu_coerce(x): # -> ustr|unicode
if isinstance(x, unicode): if isinstance(x, unicode):
return x return x
elif isinstance(x, bytes): elif isinstance(x, (bytes, bytearray)):
return pyu(x) return pyu(x)
else: else:
raise TypeError("u: coerce: invalid type %s" % type(x)) raise TypeError("u: coerce: invalid type %s" % type(x))
...@@ -160,8 +171,8 @@ class pybstr(bytes): ...@@ -160,8 +171,8 @@ class pybstr(bytes):
is always identity even if bytes data is not valid UTF-8. is always identity even if bytes data is not valid UTF-8.
Operations in between bstr and ustr/unicode / bytes coerce to bstr. Operations in between bstr and ustr/unicode / bytes/bytearray coerce to bstr.
When the coercion happens, bytes, similarly to bstr, are also When the coercion happens, bytes and bytearray, similarly to bstr, are also
treated as UTF8-encoded strings. treated as UTF8-encoded strings.
bstr constructor accepts arbitrary objects and stringify them: bstr constructor accepts arbitrary objects and stringify them:
...@@ -169,7 +180,7 @@ class pybstr(bytes): ...@@ -169,7 +180,7 @@ class pybstr(bytes):
- if encoding and/or errors is specified, the object must provide buffer - if encoding and/or errors is specified, the object must provide buffer
interface. The data in the buffer is decoded according to provided interface. The data in the buffer is decoded according to provided
encoding/errors and further encoded via UTF-8 into bstr. encoding/errors and further encoded via UTF-8 into bstr.
- if the object is bstr/ustr / unicode/bytes - it is converted - if the object is bstr/ustr / unicode/bytes/bytearray - it is converted
to bstr. See b for details. to bstr. See b for details.
- otherwise bstr will have string representation of the object. - otherwise bstr will have string representation of the object.
...@@ -240,8 +251,8 @@ class pyustr(unicode): ...@@ -240,8 +251,8 @@ class pyustr(unicode):
is always identity even if bytes data is not valid UTF-8. is always identity even if bytes data is not valid UTF-8.
Operations in between ustr and bstr/bytes / unicode coerce to ustr. Operations in between ustr and bstr/bytes/bytearray / unicode coerce to ustr.
When the coercion happens, bytes, similarly to bstr, are also When the coercion happens, bytes and bytearray, similarly to bstr, are also
treated as UTF8-encoded strings. treated as UTF8-encoded strings.
ustr constructor, similarly to the one in bstr, accepts arbitrary objects ustr constructor, similarly to the one in bstr, accepts arbitrary objects
...@@ -379,6 +390,8 @@ def pyqq(obj): ...@@ -379,6 +390,8 @@ def pyqq(obj):
cdef _bstringify(object obj): # -> unicode|bytes cdef _bstringify(object obj): # -> unicode|bytes
if type(obj) in (pybstr, pyustr, bytes, unicode): if type(obj) in (pybstr, pyustr, bytes, unicode):
return obj return obj
if type(obj) is bytearray:
return bytes(obj)
if PY_MAJOR_VERSION >= 3: if PY_MAJOR_VERSION >= 3:
return unicode(obj) return unicode(obj)
...@@ -440,6 +453,39 @@ if PY_MAJOR_VERSION < 3: ...@@ -440,6 +453,39 @@ if PY_MAJOR_VERSION < 3:
_() _()
# patch:
#
# - bytearray.__init__ to accept ustr instead of raising 'TypeError:
# string argument without an encoding' (pybug: bytearray() should respect
# __bytes__ similarly to bytes)
# _bytearray_tp_init keeps the tp_init slot value read from bytearray at this
# point, so that the replacement below can delegate to it.
cdef initproc _bytearray_tp_init = (<_XPyTypeObject*>bytearray) .tp_init
# _bytearray_tp_xinit is the replacement for bytearray's tp_init slot.
#
# If it is called with exactly one positional argument that is a pyustr, and
# with no (or empty) keyword arguments, that argument is first converted with
# pyb() and the saved _bytearray_tp_init is invoked with the converted value.
# In every other case args/kw are forwarded to _bytearray_tp_init unchanged.
#
# Returns whatever _bytearray_tp_init returns; declared `except -1`.
cdef int _bytearray_tp_xinit(object self, PyObject* args, PyObject* kw) except -1:
# only the positional-only call form is considered for rewriting
if args != NULL and (kw == NULL or (not <object>kw)):
argv = <object>args
if isinstance(argv, tuple) and len(argv) == 1:
arg = argv[0]
if isinstance(arg, pyustr):
# build a fresh 1-tuple holding pyb(arg) and pass it on instead of the original args
argv = (pyb(arg),) # NOTE argv is kept alive till end of function
args = <PyObject*>argv # no need to incref it
return _bytearray_tp_init(self, args, kw)
# _bytearray_x__init__ is the Python-level counterpart of _bytearray_tp_xinit:
# it packs its positional and keyword arguments and forwards them to
# _bytearray_tp_xinit. The result of that call is deliberately discarded so
# that the function itself returns None.
def _bytearray_x__init__(self, *argv, **kw):
# NOTE don't return - just call: __init__ should return None
_bytearray_tp_xinit(self, <PyObject*>argv, <PyObject*>kw)
# _ installs the bytearray.__init__ patch. It walks bytearray itself plus the
# types returned by bytearray.__subclasses__(); for a type whose tp_init is
# still the original _bytearray_tp_init, the slot is switched to
# _bytearray_tp_xinit, and _patch_slot is used to install _bytearray_x__init__
# as '__init__'.
#
# NOTE(review): indentation is not visible here, so whether the _patch_slot
# call is nested under the tp_init check cannot be confirmed from this view.
# NOTE(review): __subclasses__() is documented to list immediate subclasses
# only - confirm that deeper subclasses do not need explicit handling.
def _():
cdef PyTypeObject* t
for pyt in [bytearray] + bytearray.__subclasses__():
assert isinstance(pyt, type)
t = <PyTypeObject*>pyt
# second view of the same type object, through the struct that exposes tp_init
t_ = <_XPyTypeObject*>t
if t_.tp_init == _bytearray_tp_init:
t_.tp_init = _bytearray_tp_xinit
_patch_slot(t, '__init__', _bytearray_x__init__)
# run the patching once, right here at module level
_()
# _patch_slot installs func_or_descr into typ's __dict__ as name. # _patch_slot installs func_or_descr into typ's __dict__ as name.
# #
# if func_or_descr is descriptor (has __get__), it is installed as is. # if func_or_descr is descriptor (has __get__), it is installed as is.
......
...@@ -35,8 +35,11 @@ import array ...@@ -35,8 +35,11 @@ import array
# buftypes lists types with buffer interface that we will test against. # buftypes lists types with buffer interface that we will test against.
#
# NOTE bytearray is not included here - being bytes-like object it is handled
# and tested explicitly in tests that exercise interaction of bstr/ustr with
# bytes/unicode and bytearray.
buftypes = [ buftypes = [
bytearray,
memoryview, memoryview,
lambda x: array.array('B', x), lambda x: array.array('B', x),
] ]
...@@ -125,7 +128,7 @@ def test_strings_basic(): ...@@ -125,7 +128,7 @@ def test_strings_basic():
assert ub_tunicode_ == tunicode assert ub_tunicode_ == tunicode
# b/u accept only ~bytes/~unicode # b/u accept only ~bytes/~unicode/bytearray
with raises(TypeError): b() with raises(TypeError): b()
with raises(TypeError): u() with raises(TypeError): u()
with raises(TypeError): b(123) with raises(TypeError): b(123)
...@@ -149,6 +152,7 @@ def test_strings_basic(): ...@@ -149,6 +152,7 @@ def test_strings_basic():
b_ = xbytes ("мир"); assert type(b_) is bytes b_ = xbytes ("мир"); assert type(b_) is bytes
u_ = xunicode ("мир"); assert type(u_) is unicode u_ = xunicode ("мир"); assert type(u_) is unicode
ba_ = xbytearray("мир"); assert type(ba_) is bytearray
# b/u from unicode # b/u from unicode
bs = b(u_); assert isinstance(bs, bytes); assert type(bs) is bstr bs = b(u_); assert isinstance(bs, bytes); assert type(bs) is bstr
...@@ -162,11 +166,15 @@ def test_strings_basic(): ...@@ -162,11 +166,15 @@ def test_strings_basic():
_ = bstr(b_); assert type(_) is bstr; assert _ == "мир" _ = bstr(b_); assert type(_) is bstr; assert _ == "мир"
_ = ustr(b_); assert type(_) is ustr; assert _ == "мир" _ = ustr(b_); assert type(_) is ustr; assert _ == "мир"
# TODO also handle bytearray? # b/u from bytearray
_ = b(ba_); assert type(_) is bstr; assert _ == "мир"
_ = u(ba_); assert type(_) is ustr; assert _ == "мир"
_ = bstr(ba_); assert type(_) is bstr; assert _ == "мир"
_ = ustr(ba_); assert type(_) is ustr; assert _ == "мир"
# bstr/ustr from bytes/buffer with encoding # bstr/ustr from bytes/bytearray/buffer with encoding
k8mir_bytes = u"мир".encode('koi8-r') k8mir_bytes = u"мир".encode('koi8-r')
for tbuf in [bytes] + buftypes: for tbuf in [bytes, bytearray] + buftypes:
k8mir = tbuf(k8mir_bytes) k8mir = tbuf(k8mir_bytes)
_ = bstr(k8mir, 'koi8-r'); assert type(_) is bstr; assert _ == "мир" _ = bstr(k8mir, 'koi8-r'); assert type(_) is bstr; assert _ == "мир"
_ = ustr(k8mir, 'koi8-r'); assert type(_) is ustr; assert _ == "мир" _ = ustr(k8mir, 'koi8-r'); assert type(_) is ustr; assert _ == "мир"
...@@ -174,10 +182,10 @@ def test_strings_basic(): ...@@ -174,10 +182,10 @@ def test_strings_basic():
with raises(UnicodeDecodeError): ustr(k8mir, 'ascii') with raises(UnicodeDecodeError): ustr(k8mir, 'ascii')
_ = bstr(k8mir, 'ascii', 'replace'); assert type(_) is bstr; assert _ == u'\ufffd\ufffd\ufffd' _ = bstr(k8mir, 'ascii', 'replace'); assert type(_) is bstr; assert _ == u'\ufffd\ufffd\ufffd'
_ = ustr(k8mir, 'ascii', 'replace'); assert type(_) is ustr; assert _ == u'\ufffd\ufffd\ufffd' _ = ustr(k8mir, 'ascii', 'replace'); assert type(_) is ustr; assert _ == u'\ufffd\ufffd\ufffd'
# no encoding -> utf8 with surrogateescape for bytes, stringify for the rest # no encoding -> utf8 with surrogateescape for bytes/bytearray, stringify for the rest
k8mir_usurrogateescape = u'\udccd\udcc9\udcd2' k8mir_usurrogateescape = u'\udccd\udcc9\udcd2'
k8mir_strok = k8mir_usurrogateescape k8mir_strok = k8mir_usurrogateescape
if not tbuf in (bytes,): if not tbuf in (bytes, bytearray):
k8mir_strok = str(k8mir) # e.g. '<memory at ...>' for memoryview k8mir_strok = str(k8mir) # e.g. '<memory at ...>' for memoryview
_ = bstr(k8mir); assert type(_) is bstr; assert _ == k8mir_strok _ = bstr(k8mir); assert type(_) is bstr; assert _ == k8mir_strok
_ = ustr(k8mir); assert type(_) is ustr; assert _ == k8mir_strok _ = ustr(k8mir); assert type(_) is ustr; assert _ == k8mir_strok
...@@ -202,6 +210,10 @@ def test_strings_basic(): ...@@ -202,6 +210,10 @@ def test_strings_basic():
_ = unicode(bs); assert type(_) is ustr; assert _ == "мир" _ = unicode(bs); assert type(_) is ustr; assert _ == "мир"
_ = bytes (us); assert type(_) is bstr; assert _ == "мир" _ = bytes (us); assert type(_) is bstr; assert _ == "мир"
# bytearray(b|u) -> bytearray
_ = bytearray(bs); assert type(_) is bytearray; assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80'
_ = bytearray(us); assert type(_) is bytearray; assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80'
# b(u(·)), u(b(·)) # b(u(·)), u(b(·))
_ = b(us); assert type(_) is bstr; assert _ == "мир" _ = b(us); assert type(_) is bstr; assert _ == "мир"
_ = u(bs); assert type(_) is ustr; assert _ == "мир" _ = u(bs); assert type(_) is ustr; assert _ == "мир"
...@@ -226,14 +238,14 @@ def test_strings_basic(): ...@@ -226,14 +238,14 @@ def test_strings_basic():
# verify string operations like `x + y` for all combinations of pairs from # verify string operations like `x + y` for all combinations of pairs from
# bytes, unicode, bstr and ustr. Except if both x and y are std # bytes, unicode, bstr, ustr and bytearray. Except if both x and y are std
# python types, e.g. (bytes, unicode), because those combinations are handled # python types, e.g. (bytes, unicode), because those combinations are handled
# only by builtin python code and might be rejected. # only by builtin python code and might be rejected.
@mark.parametrize('tx', (bytes, unicode, bstr, ustr)) @mark.parametrize('tx', (bytes, unicode, bstr, ustr, bytearray))
@mark.parametrize('ty', (bytes, unicode, bstr, ustr)) @mark.parametrize('ty', (bytes, unicode, bstr, ustr, bytearray))
def test_strings_ops2(tx, ty): def test_strings_ops2(tx, ty):
# skip e.g. regular bytes vs regular unicode # skip e.g. regular bytes vs regular unicode
tstd = {bytes, unicode} tstd = {bytes, unicode, bytearray}
if tx in tstd and ty in tstd and tx is not ty: if tx in tstd and ty in tstd and tx is not ty:
skip() skip()
...@@ -383,6 +395,7 @@ def test_qq(): ...@@ -383,6 +395,7 @@ def test_qq():
# verify that what we patched stay unaffected when # verify that what we patched stay unaffected when
# called outside of bstr/ustr context. # called outside of bstr/ustr context.
def test_strings_patched_transparently(): def test_strings_patched_transparently():
b_ = xbytes ("мир"); assert type(b_) is bytes
u_ = xunicode ("мир"); assert type(u_) is unicode u_ = xunicode ("мир"); assert type(u_) is unicode
# unicode comparison stay unaffected # unicode comparison stay unaffected
...@@ -401,6 +414,25 @@ def test_strings_patched_transparently(): ...@@ -401,6 +414,25 @@ def test_strings_patched_transparently():
assert (u_ <= u2) is False ; assert (u2 <= u_) is True assert (u_ <= u2) is False ; assert (u2 <= u_) is True
assert (u_ >= u2) is True ; assert (u2 >= u_) is False assert (u_ >= u2) is True ; assert (u2 >= u_) is False
# bytearray.__init__ stay unaffected
with raises(TypeError): bytearray(u'мир')
a = bytearray()
with raises(TypeError): a.__init__(u'мир')
def _(*argv):
a = bytearray(*argv)
b = bytearray(); _ = b.__init__(*argv); assert _ is None
ra = repr(a)
rb = repr(b)
assert ra == rb
return ra
assert _() == r"bytearray(b'')"
assert _(b_) == r"bytearray(b'\xd0\xbc\xd0\xb8\xd1\x80')"
assert _(u_, 'utf-8') == r"bytearray(b'\xd0\xbc\xd0\xb8\xd1\x80')"
assert _(3) == r"bytearray(b'\x00\x00\x00')"
assert _((1,2,3)) == r"bytearray(b'\x01\x02\x03')"
# ---- benchmarks ---- # ---- benchmarks ----
...@@ -431,10 +463,11 @@ def bench_bencode(b): ...@@ -431,10 +463,11 @@ def bench_bencode(b):
# ---- misc ---- # ---- misc ----
# xbytes/xunicode convert provided bytes/unicode object to bytes or # xbytes/xunicode/xbytearray convert provided bytes/unicode object to bytes,
# unicode correspondingly to function name. # unicode or bytearray correspondingly to function name.
def xbytes(x): return x.encode('utf-8') if type(x) is unicode else x def xbytes(x): return x.encode('utf-8') if type(x) is unicode else x
def xunicode(x): return x.decode('utf-8') if type(x) is bytes else x def xunicode(x): return x.decode('utf-8') if type(x) is bytes else x
# xbytearray passes x through xbytes and wraps the result into a new bytearray.
def xbytearray(x): return bytearray(xbytes(x))
# xstr returns string corresponding to specified type and data. # xstr returns string corresponding to specified type and data.
def xstr(text, typ): def xstr(text, typ):
...@@ -442,6 +475,7 @@ def xstr(text, typ): ...@@ -442,6 +475,7 @@ def xstr(text, typ):
t = { t = {
bytes: xbytes, bytes: xbytes,
unicode: xunicode, unicode: xunicode,
bytearray: xbytearray,
bstr: b, bstr: b,
ustr: u, ustr: u,
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment