golang_str: Treat bytearray also as bytestring, just mutable

bytearray was introduced in Python as a mutable version of bytes. It has all string methods (e.g. .capitalize(), .islower(), etc.), and it also supports % formatting. In other words, it has all the attributes of a byte-string, the only difference from bytes being that bytearray is mutable. This makes bytearray handy when a string is constructed incrementally, step by step, without the overhead of creating and destroying many bytes objects. So, since bytearray is also a bytestring, similarly to bytes, let's add support for bstr and ustr to interoperate with bytearray: - b/u and bstr/ustr now accept bytearray as argument and treat it as bytestring. - bytearray() constructor, similarly to bytes() and unicode() constructors, now also accepts bstr/ustr and creates a bytearray object corresponding to the byte-stream of the input. For the latter point to work we need to patch bytearray.__init__() a bit, since, contrary to bytes.__init__(), it does not pay attention to whether the provided argument has a __bytes__ method or not.

golang_str: Treat bytearray also as bytestring, just mutable
bytearray was introduced in Python as a mutable version of bytes. It has all string methods (e.g. .capitalize(), .islower(), etc.), and it also supports % formatting. In other words, it has all the attributes of a byte-string, the only difference from bytes being that bytearray is mutable. This makes bytearray handy when a string is constructed incrementally, step by step, without the overhead of creating and destroying many bytes objects. So, since bytearray is also a bytestring, similarly to bytes, let's add support for bstr and ustr to interoperate with bytearray: - b/u and bstr/ustr now accept bytearray as argument and treat it as bytestring. - bytearray() constructor, similarly to bytes() and unicode() constructors, now also accepts bstr/ustr and creates a bytearray object corresponding to the byte-stream of the input. For the latter point to work we need to patch bytearray.__init__() a bit, since, contrary to bytes.__init__(), it does not pay attention to whether the provided argument has a __bytes__ method or not.
e4d5cb21 · Kirill Smelkov · 781802d4 · e4d5cb21 · e4d5cb21 · e4d5cb21
Commit e4d5cb21 authored Oct 07, 2022 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 111 additions and 31 deletions

README.rst README.rst +5 -5

golang/_golang_str.pyx golang/_golang_str.pyx +59 -13

golang/golang_str_test.py golang/golang_str_test.py +47 -13

No files found.
--- a/README.rst
+++ b/README.rst
@@ -240,16 +240,16 @@ The conversion, in both encoding and decoding, never fails and never looses
 information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity
 even if bytes data is not valid UTF-8.
-Operations in between `bstr` and `ustr`/`unicode` / `bytes` coerce to `bstr`, while
+Operations in between `bstr` and `ustr`/`unicode` / `bytes`/`bytearray` coerce to `bstr`, while
-operations in between `ustr` and `bstr`/`bytes` / `unicode` coerce
+operations in between `ustr` and `bstr`/`bytes`/`bytearray` / `unicode` coerce
-to `ustr`.  When the coercion happens, `bytes`, similarly to
+to `ustr`.  When the coercion happens, `bytes` and `bytearray`, similarly to
 `bstr`, are also treated as UTF8-encoded strings.
 `bstr` and `ustr` are meant to be drop-in replacements for standard
 `str`/`unicode` classes. They support all methods of `str`/`unicode` and in
 particular their constructors accept arbitrary objects and either convert or stringify them. For
 cases when no stringification is desired, and one only wants to convert
-`bstr`/`ustr` / `unicode`/`bytes`
+`bstr`/`ustr` / `unicode`/`bytes`/`bytearray`
 to Pygolang string, `b` and `u` provide way to make sure an
 object is either `bstr` or `ustr` correspondingly.
@@ -258,7 +258,7 @@ Usage example::
   s  = b('привет')     # s is bstr corresponding to UTF-8 encoding of 'привет'.
   def f(s):
-      s = u(s)          # make sure s is ustr, decoding as UTF-8(*) if it was bstr or bytes.
+      s = u(s)          # make sure s is ustr, decoding as UTF-8(*) if it was bstr, bytes or bytearray.
      ...               # (*) the decoding never fails nor looses information.
 .. [*] `unicode` on Python2, `str` on Python3.

--- a/golang/_golang_str.pyx
+++ b/golang/_golang_str.pyx
@@ -29,6 +29,11 @@ from cpython cimport Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE
 cdef extern from "Python.h":
    void PyType_Modified(PyTypeObject *)
+cdef extern from "Python.h":
+    ctypedef int (*initproc)(object, PyObject *, PyObject *) except -1
+    ctypedef struct _XPyTypeObject "PyTypeObject":
+        initproc  tp_init
 from libc.stdint cimport uint8_t
 pystrconv = None  # = golang.strconv imported at runtime (see __init__.py)
@@ -39,7 +44,7 @@ def pyb(s): # -> bstr
    """b converts object to bstr.
       - For bstr the same object is returned.
-       - For bytes the data is
+       - For bytes or bytearray the data is
         preserved as-is and only result type is changed to bstr.
       - For ustr/unicode the data is UTF-8 encoded. The encoding always succeeds.
@@ -61,7 +66,7 @@ def pyu(s): # -> ustr
       - For ustr the same object is returned.
       - For unicode the data is preserved as-is and only result type is changed to ustr.
-       - For bstr or bytes the data is UTF-8 decoded.
+       - For bstr, bytes or bytearray the data is UTF-8 decoded.
         The decoding always succeeds and input
         information is not lost: non-valid UTF-8 bytes are decoded into
         surrogate codes ranging from U+DC80 to U+DCFF.
@@ -90,7 +95,10 @@ cdef _pyb(bcls, s): # -> ~bstr | None
    elif isinstance(s, unicode):
        s = _utf8_encode_surrogateescape(s)
    else:
-        return None
+        if isinstance(s, bytearray):
+            s = bytes(s)
+        else:
+            return None
    assert type(s) is bytes
    return bytes.__new__(bcls, s)
@@ -102,10 +110,13 @@ cdef _pyu(ucls, s): # -> ~ustr | None
    if isinstance(s, unicode):
        if type(s) is not unicode:
            s = _udata(s)
-    elif isinstance(s, bytes):
-        s = _utf8_decode_surrogateescape(s)
    else:
-        return None
+        if isinstance(s, bytearray):
+            s = bytes(s)
+        if isinstance(s, bytes):
+            s = _utf8_decode_surrogateescape(s)
+        else:
+            return None
    assert type(s) is unicode
    return unicode.__new__(ucls, s)
@@ -115,7 +126,7 @@ cdef _pyu(ucls, s): # -> ~ustr | None
 cdef _pyb_coerce(x):  # -> bstr|bytes
    if isinstance(x, bytes):
        return x
-    elif isinstance(x, unicode):
+    elif isinstance(x, (unicode, bytearray)):
        return pyb(x)
    else:
        raise TypeError("b: coerce: invalid type %s" % type(x))
@@ -124,7 +135,7 @@ cdef _pyb_coerce(x):  # -> bstr|bytes
 cdef _pyu_coerce(x):  # -> ustr|unicode
    if isinstance(x, unicode):
        return x
-    elif isinstance(x, bytes):
+    elif isinstance(x, (bytes, bytearray)):
        return pyu(x)
    else:
        raise TypeError("u: coerce: invalid type %s" % type(x))
@@ -160,8 +171,8 @@ class pybstr(bytes):
    is always identity even if bytes data is not valid UTF-8.
-    Operations in between bstr and ustr/unicode / bytes coerce to bstr.
+    Operations in between bstr and ustr/unicode / bytes/bytearray coerce to bstr.
-    When the coercion happens, bytes, similarly to bstr, are also
+    When the coercion happens, bytes and bytearray, similarly to bstr, are also
    treated as UTF8-encoded strings.
    bstr constructor accepts arbitrary objects and stringify them:
@@ -169,7 +180,7 @@ class pybstr(bytes):
    - if encoding and/or errors is specified, the object must provide buffer
      interface. The data in the buffer is decoded according to provided
      encoding/errors and further encoded via UTF-8 into bstr.
-    - if the object is bstr/ustr / unicode/bytes - it is converted
+    - if the object is bstr/ustr / unicode/bytes/bytearray - it is converted
      to bstr. See b for details.
    - otherwise bstr will have string representation of the object.
@@ -240,8 +251,8 @@ class pyustr(unicode):
    is always identity even if bytes data is not valid UTF-8.
-    Operations in between ustr and bstr/bytes / unicode coerce to ustr.
+    Operations in between ustr and bstr/bytes/bytearray / unicode coerce to ustr.
-    When the coercion happens, bytes, similarly to bstr, are also
+    When the coercion happens, bytes and bytearray, similarly to bstr, are also
    treated as UTF8-encoded strings.
    ustr constructor, similarly to the one in bstr, accepts arbitrary objects
@@ -379,6 +390,8 @@ def pyqq(obj):
 cdef _bstringify(object obj): # -> unicode|bytes
    if type(obj) in (pybstr, pyustr, bytes, unicode):
        return obj
+    if type(obj) is bytearray:
+        return bytes(obj)
    if PY_MAJOR_VERSION >= 3:
        return unicode(obj)
@@ -440,6 +453,39 @@ if PY_MAJOR_VERSION < 3:
    _()
+# patch:
+#
+# - bytearray.__init__ to accept ustr instead of raising 'TypeError:
+#   string argument without an encoding'  (pybug: bytearray() should respect
+#   __bytes__ similarly to bytes)
+cdef initproc   _bytearray_tp_init    = (<_XPyTypeObject*>bytearray) .tp_init
+cdef int _bytearray_tp_xinit(object self, PyObject* args, PyObject* kw) except -1:
+    if args != NULL  and  (kw == NULL  or  (not <object>kw)):
+        argv = <object>args
+        if isinstance(argv, tuple)  and  len(argv) == 1:
+            arg = argv[0]
+            if isinstance(arg, pyustr):
+                argv = (pyb(arg),)      # NOTE argv is kept alive till end of function
+                args = <PyObject*>argv  #      no need to incref it
+    return _bytearray_tp_init(self, args, kw)
+def _bytearray_x__init__(self, *argv, **kw):
+    # NOTE don't return - just call: __init__ should return None
+    _bytearray_tp_xinit(self, <PyObject*>argv, <PyObject*>kw)
+def _():
+    cdef PyTypeObject* t
+    for pyt in [bytearray] + bytearray.__subclasses__():
+        assert isinstance(pyt, type)
+        t = <PyTypeObject*>pyt
+        t_ = <_XPyTypeObject*>t
+        if t_.tp_init == _bytearray_tp_init:
+            t_.tp_init = _bytearray_tp_xinit
+            _patch_slot(t, '__init__', _bytearray_x__init__)
+_()
 # _patch_slot installs func_or_descr into typ's __dict__ as name.
 #
 # if func_or_descr is descriptor (has __get__), it is installed as is.

--- a/golang/golang_str_test.py
+++ b/golang/golang_str_test.py
@@ -35,8 +35,11 @@ import array
 # buftypes lists types with buffer interface that we will test against.
+#
+# NOTE bytearray is not included here - being bytes-like object it is handled
+# and tested explicitly in tests that exercise interaction of bstr/ustr with
+# bytes/unicode and bytearray.
 buftypes = [
-        bytearray,
        memoryview,
        lambda x: array.array('B', x),
 ]
@@ -125,7 +128,7 @@ def test_strings_basic():
        assert ub_tunicode_ == tunicode
-    # b/u accept only ~bytes/~unicode
+    # b/u accept only ~bytes/~unicode/bytearray
    with raises(TypeError): b()
    with raises(TypeError): u()
    with raises(TypeError): b(123)
@@ -149,6 +152,7 @@ def test_strings_basic():
    b_  = xbytes    ("мир");  assert type(b_) is bytes
    u_  = xunicode  ("мир");  assert type(u_) is unicode
+    ba_ = xbytearray("мир");  assert type(ba_) is bytearray
    # b/u from unicode
    bs = b(u_);    assert isinstance(bs, bytes);    assert type(bs) is bstr
@@ -162,11 +166,15 @@ def test_strings_basic():
    _ = bstr(b_);  assert type(_) is bstr;  assert _ == "мир"
    _ = ustr(b_);  assert type(_) is ustr;  assert _ == "мир"
-    # TODO also handle bytearray?
+    # b/u from bytearray
+    _ = b(ba_);    assert type(_) is bstr;  assert _ == "мир"
+    _ = u(ba_);    assert type(_) is ustr;  assert _ == "мир"
+    _ = bstr(ba_); assert type(_) is bstr;  assert _ == "мир"
+    _ = ustr(ba_); assert type(_) is ustr;  assert _ == "мир"
-    # bstr/ustr from bytes/buffer with encoding
+    # bstr/ustr from bytes/bytearray/buffer with encoding
    k8mir_bytes = u"мир".encode('koi8-r')
-    for tbuf in [bytes] + buftypes:
+    for tbuf in [bytes, bytearray] + buftypes:
        k8mir = tbuf(k8mir_bytes)
        _ = bstr(k8mir, 'koi8-r');  assert type(_) is bstr;  assert _ == "мир"
        _ = ustr(k8mir, 'koi8-r');  assert type(_) is ustr;  assert _ == "мир"
@@ -174,10 +182,10 @@ def test_strings_basic():
        with raises(UnicodeDecodeError): ustr(k8mir, 'ascii')
        _ = bstr(k8mir, 'ascii', 'replace');  assert type(_) is bstr;  assert _ == u'\ufffd\ufffd\ufffd'
        _ = ustr(k8mir, 'ascii', 'replace');  assert type(_) is ustr;  assert _ == u'\ufffd\ufffd\ufffd'
-        # no encoding -> utf8 with surrogateescape for bytes,  stringify for the rest
+        # no encoding -> utf8 with surrogateescape for bytes/bytearray,  stringify for the rest
        k8mir_usurrogateescape = u'\udccd\udcc9\udcd2'
        k8mir_strok = k8mir_usurrogateescape
-        if not tbuf in (bytes,):
+        if not tbuf in (bytes, bytearray):
            k8mir_strok = str(k8mir)  # e.g. '<memory at ...>' for memoryview
        _ = bstr(k8mir);  assert type(_) is bstr;  assert _ == k8mir_strok
        _ = ustr(k8mir);  assert type(_) is ustr;  assert _ == k8mir_strok
@@ -202,6 +210,10 @@ def test_strings_basic():
    _ = unicode(bs);  assert type(_) is ustr;  assert _ == "мир"
    _ = bytes  (us);  assert type(_) is bstr;  assert _ == "мир"
+    # bytearray(b|u) -> bytearray
+    _ = bytearray(bs);  assert type(_) is bytearray;  assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80'
+    _ = bytearray(us);  assert type(_) is bytearray;  assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80'
    # b(u(·)), u(b(·))
    _ = b(us);    assert type(_) is bstr;  assert _ == "мир"
    _ = u(bs);    assert type(_) is ustr;  assert _ == "мир"
@@ -226,14 +238,14 @@ def test_strings_basic():
 # verify string operations like `x + y` for all combinations of pairs from
-# bytes, unicode, bstr and ustr. Except if both x and y are std
+# bytes, unicode, bstr, ustr and bytearray. Except if both x and y are std
 # python types, e.g. (bytes, unicode), because those combinations are handled
 # only by builtin python code and might be rejected.
-@mark.parametrize('tx', (bytes, unicode, bstr, ustr))
+@mark.parametrize('tx', (bytes, unicode, bstr, ustr, bytearray))
-@mark.parametrize('ty', (bytes, unicode, bstr, ustr))
+@mark.parametrize('ty', (bytes, unicode, bstr, ustr, bytearray))
 def test_strings_ops2(tx, ty):
    # skip e.g. regular bytes vs regular unicode
-    tstd = {bytes, unicode}
+    tstd = {bytes, unicode, bytearray}
    if tx in tstd  and  ty in tstd  and  tx is not ty:
        skip()
@@ -383,6 +395,7 @@ def test_qq():
 # verify that what we patched stay unaffected when
 # called outside of bstr/ustr context.
 def test_strings_patched_transparently():
+    b_  = xbytes    ("мир");  assert type(b_)  is bytes
    u_  = xunicode  ("мир");  assert type(u_)  is unicode
    # unicode comparison stay unaffected
@@ -401,6 +414,25 @@ def test_strings_patched_transparently():
    assert (u_ <= u2)  is False     ; assert (u2 <= u_)  is True
    assert (u_ >= u2)  is True      ; assert (u2 >= u_)  is False
+    # bytearray.__init__ stay unaffected
+    with raises(TypeError): bytearray(u'мир')
+    a = bytearray()
+    with raises(TypeError): a.__init__(u'мир')
+    def _(*argv):
+        a = bytearray(*argv)
+        b = bytearray(); _ = b.__init__(*argv); assert _ is None
+        ra = repr(a)
+        rb = repr(b)
+        assert ra == rb
+        return ra
+    assert _()              == r"bytearray(b'')"
+    assert _(b_)            == r"bytearray(b'\xd0\xbc\xd0\xb8\xd1\x80')"
+    assert _(u_, 'utf-8')   == r"bytearray(b'\xd0\xbc\xd0\xb8\xd1\x80')"
+    assert _(3)             == r"bytearray(b'\x00\x00\x00')"
+    assert _((1,2,3))       == r"bytearray(b'\x01\x02\x03')"
 # ---- benchmarks ----
@@ -431,10 +463,11 @@ def bench_bencode(b):
 # ---- misc ----
-# xbytes/xunicode convert provided bytes/unicode object to bytes or
+# xbytes/xunicode/xbytearray convert provided bytes/unicode object to bytes,
-# unicode correspondingly to function name.
+# unicode or bytearray correspondingly to function name.
 def xbytes(x):     return x.encode('utf-8') if type(x) is unicode else x
 def xunicode(x):   return x.decode('utf-8') if type(x) is bytes   else x
+def xbytearray(x): return bytearray(xbytes(x))
 # xstr returns string corresponding to specified type and data.
 def xstr(text, typ):
@@ -442,6 +475,7 @@ def xstr(text, typ):
        t = {
            bytes:      xbytes,
            unicode:    xunicode,
+            bytearray:  xbytearray,
            bstr:       b,
            ustr:       u,
        }