golang_str: Teach b/u to accept objects with buffer interface

And to convert them to bstr/ustr, decoding the buffer data as if it were bytes. This is needed if e.g. we have data in an mmap or numpy.ndarray and want to convert the data to a string. The conversion is always explicit, via an explicit call to b/u. And for the bstr/ustr constructors we preserve their behaviour to match the unicode constructor: not to convert automatically, but instead to stringify the object, e.g. as shown below: In [1]: bdata = b'hello 123' In [2]: mview = memoryview(bdata) In [3]: str(mview) Out[3]: '<memory at 0x7fb226b26700>' # NOTE _not_ b'hello 123'

golang_str: Teach b/u to accept objects with buffer interface
And to convert them to bstr/ustr, decoding the buffer data as if it were bytes. This is needed if e.g. we have data in an mmap or numpy.ndarray and want to convert the data to a string. The conversion is always explicit, via an explicit call to b/u. And for the bstr/ustr constructors we preserve their behaviour to match the unicode constructor: not to convert automatically, but instead to stringify the object, e.g. as shown below: In [1]: bdata = b'hello 123' In [2]: mview = memoryview(bdata) In [3]: str(mview) Out[3]: '<memory at 0x7fb226b26700>' # NOTE _not_ b'hello 123'
d7e55bb0 · Kirill Smelkov · e4d5cb21 · d7e55bb0 · d7e55bb0 · d7e55bb0
Commit d7e55bb0 authored Oct 07, 2022 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 63 additions and 11 deletions

README.rst README.rst +5 -3

golang/_golang_str.pyx golang/_golang_str.pyx +21 -7

golang/golang_str_test.py golang/golang_str_test.py +37 -1

No files found.
--- a/README.rst
+++ b/README.rst
@@ -249,8 +249,8 @@ to `ustr`.  When the coercion happens, `bytes` and `bytearray`, similarly to
 `str`/`unicode` classes. They support all methods of `str`/`unicode` and in
 particular their constructors accept arbitrary objects and either convert or stringify them. For
 cases when no stringification is desired, and one only wants to convert
-`bstr`/`ustr` / `unicode`/`bytes`/`bytearray`
-to Pygolang string, `b` and `u` provide way to make sure an
+`bstr`/`ustr` / `unicode`/`bytes`/`bytearray`, or an object with `buffer`
+interface [*]_, to Pygolang string, `b` and `u` provide way to make sure an
 object is either `bstr` or `ustr` correspondingly.

 Usage example::
@@ -258,10 +258,12 @@ Usage example::
   s  = b('привет')     # s is bstr corresponding to UTF-8 encoding of 'привет'.

   def f(s):
-      s = u(s)          # make sure s is ustr, decoding as UTF-8(*) if it was bstr, bytes or bytearray.
+      s = u(s)          # make sure s is ustr, decoding as UTF-8(*) if it was bstr, bytes, bytearray or buffer.
      ...               # (*) the decoding never fails nor looses information.

 .. [*] `unicode` on Python2, `str` on Python3.
+.. [*] | data in buffer, similarly to `bytes` and `bytearray`, is treated as UTF8-encoded string.
+       | Notice that only explicit conversion through `b` and `u` accepts objects with buffer interface. Automatic coercion does not.


 Import

--- a/golang/_golang_str.pyx
+++ b/golang/_golang_str.pyx
@@ -26,6 +26,7 @@ from cpython cimport PyUnicode_AsUnicode, PyUnicode_GetSize, PyUnicode_FromUnico
 from cpython cimport PyUnicode_DecodeUTF8
 from cpython cimport PyTypeObject, Py_TYPE, richcmpfunc
 from cpython cimport Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE
+from cpython cimport PyObject_CheckBuffer
 cdef extern from "Python.h":
    void PyType_Modified(PyTypeObject *)

@@ -44,7 +45,7 @@ def pyb(s): # -> bstr
    """b converts object to bstr.

       - For bstr the same object is returned.
-       - For bytes or bytearray the data is
+       - For bytes, bytearray, or object with buffer interface, the data is
         preserved as-is and only result type is changed to bstr.
       - For ustr/unicode the data is UTF-8 encoded. The encoding always succeeds.

@@ -66,7 +67,7 @@ def pyu(s): # -> ustr

       - For ustr the same object is returned.
       - For unicode the data is preserved as-is and only result type is changed to ustr.
-       - For bstr, bytes or bytearray the data is UTF-8 decoded.
+       - For bstr, bytes, bytearray, or object with buffer interface, the data is UTF-8 decoded.
         The decoding always succeeds and input
         information is not lost: non-valid UTF-8 bytes are decoded into
         surrogate codes ranging from U+DC80 to U+DCFF.
@@ -95,9 +96,8 @@ cdef _pyb(bcls, s): # -> ~bstr | None
    elif isinstance(s, unicode):
        s = _utf8_encode_surrogateescape(s)
    else:
-        if isinstance(s, bytearray):
-            s = bytes(s)
-        else:
+        s = _ifbuffer_data(s) # bytearray and buffer
+        if s is None:
            return None

    assert type(s) is bytes
@@ -111,8 +111,9 @@ cdef _pyu(ucls, s): # -> ~ustr | None
        if type(s) is not unicode:
            s = _udata(s)
    else:
-        if isinstance(s, bytearray):
-            s = bytes(s)
+        _ = _ifbuffer_data(s) # bytearray and buffer
+        if _ is not None:
+            s = _
        if isinstance(s, bytes):
            s = _utf8_decode_surrogateescape(s)
        else:
@@ -121,6 +122,19 @@ cdef _pyu(ucls, s): # -> ~ustr | None
    assert type(s) is unicode
    return unicode.__new__(ucls, s)

+# _ifbuffer_data returns contained data if obj provides buffer interface.
+cdef _ifbuffer_data(obj): # -> bytes|None
+    if PyObject_CheckBuffer(obj):
+        if PY_MAJOR_VERSION >= 3:
+            return bytes(obj)
+        else:
+            # py2: bytes(memoryview)  returns  '<memory at ...>'
+            return bytes(bytearray(obj))
+    elif _XPyObject_CheckOldBuffer(obj):  # old-style buffer, py2-only
+        return bytes(_buffer_py2(obj))
+    else:
+        return None
+

 # _pyb_coerce coerces x from `b op x` to be used in operation with pyb.
 cdef _pyb_coerce(x):  # -> bstr|bytes

--- a/golang/golang_str_test.py
+++ b/golang/golang_str_test.py
@@ -128,7 +128,7 @@ def test_strings_basic():
        assert ub_tunicode_ == tunicode


-    # b/u accept only ~bytes/~unicode/bytearray
+    # b/u accept only ~bytes/~unicode/bytearray/buffer
    with raises(TypeError): b()
    with raises(TypeError): u()
    with raises(TypeError): b(123)
@@ -172,6 +172,15 @@ def test_strings_basic():
    _ = bstr(ba_); assert type(_) is bstr;  assert _ == "мир"
    _ = ustr(ba_); assert type(_) is ustr;  assert _ == "мир"

+    # b/u from buffer
+    for tbuf in buftypes:
+        bbuf_ = tbuf(b_)
+        bbuf_std_str = str(bbuf_)   # e.g. '<memory at ...>' for memoryview
+        _ = b(bbuf_);    assert type(_) is bstr;  assert _ == "мир"
+        _ = u(bbuf_);    assert type(_) is ustr;  assert _ == "мир"
+        _ = bstr(bbuf_); assert type(_) is bstr;  assert _ == bbuf_std_str # NOTE not 'мир'
+        _ = ustr(bbuf_); assert type(_) is ustr;  assert _ == bbuf_std_str
+
    # bstr/ustr from bytes/bytearray/buffer with encoding
    k8mir_bytes = u"мир".encode('koi8-r')
    for tbuf in [bytes, bytearray] + buftypes:
@@ -189,6 +198,8 @@ def test_strings_basic():
            k8mir_strok = str(k8mir)  # e.g. '<memory at ...>' for memoryview
        _ = bstr(k8mir);  assert type(_) is bstr;  assert _ == k8mir_strok
        _ = ustr(k8mir);  assert type(_) is ustr;  assert _ == k8mir_strok
+        _ = b   (k8mir);  assert type(_) is bstr;  assert _ == k8mir_usurrogateescape # always surrogateescape
+        _ = u   (k8mir);  assert type(_) is ustr;  assert _ == k8mir_usurrogateescape
        # encoding specified -> treat it precisely
        with raises(UnicodeDecodeError): bstr(k8mir, 'utf-8')
        with raises(UnicodeDecodeError): ustr(k8mir, 'utf-8')
@@ -284,6 +295,31 @@ def test_strings_ops2(tx, ty):
    assert      y > x


+# verify string operations like `x + y` for x being bstr/ustr and y being a
+# type unsupported for coercion.
+@mark.parametrize('tx', (bstr, ustr))
+@mark.parametrize('ty', buftypes)
+def test_strings_ops2_bufreject(tx, ty):
+    x = xstr(u'мир', tx)
+    y = ty(b'123')
+
+    assert  (x == y) is False           # see test_strings_ops2_eq_any
+    assert  (x != y) is True
+    with raises(TypeError):     x >= y
+    with raises(TypeError):     x <= y
+    with raises(TypeError):     x >  y
+    with raises(TypeError):     x <  y
+
+    # `y > x` does not raise when x is bstr (= provides buffer):
+    y == x  # not raises TypeError  -  see test_strings_ops2_eq_any
+    y != x  #
+    if tx is not bstr:
+        with raises(TypeError):     y >= x
+        with raises(TypeError):     y <= x
+        with raises(TypeError):     y >  x
+        with raises(TypeError):     y <  x
+
+
 # verify string operations like `x == *` for x being bstr/ustr.
 # Those operations must succeed for any hashable type or else bstr/ustr could
 # not be used as dict keys.