golang_str: Teach bstr/ustr to compare wrt any string with automatic coercion

So that e.g. `bstr == <any string type>` works. We want `bstr == ustr` to work because we intend those types to be interoperable. We also want e.g. `bstr == "a_string"` to work because we want bstr to be interoperable with standard strings. In general we want to have full automatic interoperability with all string types, so that e.g. `bstr == X` works for X being any of bstr, ustr, unicode, bytes (and later bytearray). For now we add support only for comparison operators. But later, we will be adding support for e.g. +, string methods, etc - and in all those operations we will be following the same approach: to have automatic interoperability with all string types out of the box. The text added to README reflects this. The patch to unicode.tp_richcompare on py2 illustrates our approach of adjusting builtin types when absolutely needed. In this particular case the original builtin unicode.__eq__(unicode, bstr) always returns False for non-ASCII bstr even though bstr has a .__unicode__() method. Our adjustment is non-intrusive - we adjust unicode behaviour only wrt bstr and it stays exactly the same as before wrt all other types. Still, we do that with care and add a test that verifies that the behaviour of what we patched stays unaffected when used outside of bstr/ustr context.

golang_str: Teach bstr/ustr to compare wrt any string with automatic coercion
So that e.g. `bstr == <any string type>` works. We want `bstr == ustr` to work because we intend those types to be interoperable. We also want e.g. `bstr == "a_string"` to work because we want bstr to be interoperable with standard strings. In general we want to have full automatic interoperability with all string types, so that e.g. `bstr == X` works for X being any of bstr, ustr, unicode, bytes (and later bytearray). For now we add support only for comparison operators. But later, we will be adding support for e.g. +, string methods, etc - and in all those operations we will be following the same approach: to have automatic interoperability with all string types out of the box. The text added to README reflects this. The patch to unicode.tp_richcompare on py2 illustrates our approach of adjusting builtin types when absolutely needed. In this particular case the original builtin unicode.__eq__(unicode, bstr) always returns False for non-ASCII bstr even though bstr has a .__unicode__() method. Our adjustment is non-intrusive - we adjust unicode behaviour only wrt bstr and it stays exactly the same as before wrt all other types. Still, we do that with care and add a test that verifies that the behaviour of what we patched stays unaffected when used outside of bstr/ustr context.
54c2a3cf · Kirill Smelkov · 34667355 · 54c2a3cf · 54c2a3cf · 54c2a3cf
Commit 54c2a3cf authored Oct 05, 2022 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 265 additions and 10 deletions

README.rst README.rst +5 -0

golang/_golang_str.pyx golang/_golang_str.pyx +116 -1

golang/golang_str_test.py golang/golang_str_test.py +144 -9

No files found.
--- a/README.rst
+++ b/README.rst
@@ -240,6 +240,11 @@ The conversion, in both encoding and decoding, never fails and never loses
 information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity
 even if bytes data is not valid UTF-8.

+Operations in between `bstr` and `ustr`/`unicode` / `bytes` coerce to `bstr`, while
+operations in between `ustr` and `bstr`/`bytes` / `unicode` coerce
+to `ustr`.  When the coercion happens, `bytes`, similarly to
+`bstr`, are also treated as UTF8-encoded strings.
+
 `bstr`/`ustr` constructors will accept arbitrary objects and either convert or stringify them. For
 cases when no stringification is desired, and one only wants to convert
 `bstr`/`ustr` / `unicode`/`bytes`

--- a/golang/_golang_str.pyx
+++ b/golang/_golang_str.pyx
@@ -24,7 +24,8 @@ It is included from _golang.pyx .

 from cpython cimport PyUnicode_AsUnicode, PyUnicode_GetSize, PyUnicode_FromUnicode
 from cpython cimport PyUnicode_DecodeUTF8
-from cpython cimport PyTypeObject, Py_TYPE
+from cpython cimport PyTypeObject, Py_TYPE, richcmpfunc
+from cpython cimport Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE
 cdef extern from "Python.h":
    void PyType_Modified(PyTypeObject *)

@@ -93,6 +94,25 @@ def pyu(s): # -> ustr
    return pyustr(s)


+# _pyb_coerce coerces x from `b op x` to be used in operation with pyb.
+cdef _pyb_coerce(x):  # -> bstr|bytes
+    if isinstance(x, bytes):
+        return x
+    elif isinstance(x, unicode):
+        return pyb(x)
+    else:
+        raise TypeError("b: coerce: invalid type %s" % type(x))
+
+# _pyu_coerce coerces x from `u op x` to be used in operation with pyu.
+cdef _pyu_coerce(x):  # -> ustr|unicode
+    if isinstance(x, unicode):
+        return x
+    elif isinstance(x, bytes):
+        return pyu(x)
+    else:
+        raise TypeError("u: coerce: invalid type %s" % type(x))
+
+
 # __pystr converts obj to ~str of current python:
 #
 #   - to ~bytes,   via b, if running on py2, or
@@ -123,6 +143,10 @@ class pybstr(bytes):

    is always identity even if bytes data is not valid UTF-8.

+    Operations in between bstr and ustr/unicode / bytes coerce to bstr.
+    When the coercion happens, bytes, similarly to bstr, are also
+    treated as UTF8-encoded strings.
+
    See also: b, ustr/u.
    """

@@ -141,6 +165,32 @@ class pybstr(bytes):
            return self


+    def __hash__(self):
+        # hash of the same unicode and UTF-8 encoded bytes is generally different
+        # -> we can't make hash(bstr) == both hash(bytes) and hash(unicode) at the same time.
+        # -> make hash(bstr) == hash(str type of current python) so that bstr
+        #    could be used as keys in dictionary interchangeably with native str type.
+        if PY_MAJOR_VERSION >= 3:
+            return hash(pyu(self))
+        else:
+            return bytes.__hash__(self)
+
+    # == != < > <= >=
+    # NOTE == and != are special: they must succeed against any type so that
+    # bstr could be used as dict key.
+    def __eq__(a, b):
+        try:
+            b = _pyb_coerce(b)
+        except TypeError:
+            return False
+        return bytes.__eq__(a, b)
+    def __ne__(a, b):   return not a.__eq__(b)
+    def __lt__(a, b):   return bytes.__lt__(a, _pyb_coerce(b))
+    def __gt__(a, b):   return bytes.__gt__(a, _pyb_coerce(b))
+    def __le__(a, b):   return bytes.__le__(a, _pyb_coerce(b))
+    def __ge__(a, b):   return bytes.__ge__(a, _pyb_coerce(b))
+
+
 cdef class pyustr(unicode):
    """ustr is unicode-string.

@@ -151,6 +201,10 @@ cdef class pyustr(unicode):

    is always identity even if bytes data is not valid UTF-8.

+    Operations in between ustr and bstr/bytes / unicode coerce to ustr.
+    When the coercion happens, bytes, similarly to bstr, are also
+    treated as UTF8-encoded strings.
+
    See also: u, bstr/b.
    """

@@ -164,6 +218,29 @@ cdef class pyustr(unicode):
            return pyb(self)


+    def __hash__(self):
+        # see pybstr.__hash__ for why we stick to hash of current str
+        if PY_MAJOR_VERSION >= 3:
+            return unicode.__hash__(self)
+        else:
+            return hash(pyb(self))
+
+    # == != < > <= >=
+    # NOTE == and != are special: they must succeed against any type so that
+    # ustr could be used as dict key.
+    def __eq__(a, b):
+        try:
+            b = _pyu_coerce(b)
+        except TypeError:
+            return False
+        return unicode.__eq__(a, b)
+    def __ne__(a, b):   return not a.__eq__(b)
+    def __lt__(a, b):   return unicode.__lt__(a, _pyu_coerce(b))
+    def __gt__(a, b):   return unicode.__gt__(a, _pyu_coerce(b))
+    def __le__(a, b):   return unicode.__le__(a, _pyu_coerce(b))
+    def __ge__(a, b):   return unicode.__ge__(a, _pyu_coerce(b))
+
+
 # _bdata/_udata retrieve raw data from bytes/unicode.
 def _bdata(obj): # -> bytes
    assert isinstance(obj, bytes)
@@ -235,6 +312,44 @@ def pyqq(obj):
    return qobj


+# py2: adjust unicode.tp_richcompare(a,b) to return NotImplemented if b is bstr.
+# This way we avoid `UnicodeWarning: Unicode equal comparison failed to convert
+# both arguments to Unicode - interpreting them as being unequal`, and that
+# further `a == b` returns False even if `b == a` gives True.
+#
+# NOTE there is no need to do the same for ustr, because ustr inherits from
+# unicode and can be always natively converted to unicode by python itself.
+cdef richcmpfunc _unicode_tp_richcompare = Py_TYPE(u'').tp_richcompare
+
+cdef object _unicode_tp_xrichcompare(object a, object b, int op):
+    if isinstance(b, pybstr):
+        return NotImplemented
+    return _unicode_tp_richcompare(a, b, op)
+
+# NOTE the __op__ wrappers installed into unicode.__dict__ must go through the
+# patched _unicode_tp_xrichcompare, not the original slot, so that explicit
+# unicode.__eq__(u, bstr) & co. also return NotImplemented for bstr, consistent
+# with what `u == bstr` does via the adjusted tp_richcompare.
+cdef object _unicode_x__eq__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_EQ)
+cdef object _unicode_x__ne__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_NE)
+cdef object _unicode_x__lt__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_LT)
+cdef object _unicode_x__gt__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_GT)
+cdef object _unicode_x__le__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_LE)
+cdef object _unicode_x__ge__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_GE)
+
+if PY_MAJOR_VERSION < 3:
+    def _():
+        cdef PyTypeObject* t
+        for pyt in [unicode] + unicode.__subclasses__():
+            assert isinstance(pyt, type)
+            t = <PyTypeObject*>pyt
+            if t.tp_richcompare == _unicode_tp_richcompare:
+                t.tp_richcompare = _unicode_tp_xrichcompare
+                _patch_slot(t, "__eq__", _unicode_x__eq__)
+                _patch_slot(t, "__ne__", _unicode_x__ne__)
+                _patch_slot(t, "__lt__", _unicode_x__lt__)
+                _patch_slot(t, "__gt__", _unicode_x__gt__)
+                _patch_slot(t, "__le__", _unicode_x__le__)
+                _patch_slot(t, "__ge__", _unicode_x__ge__)
+    _()
+
+
 # _patch_slot installs func_or_descr into typ's __dict__ as name.
 #
 # if func_or_descr is descriptor (has __get__), it is installed as is.

--- a/golang/golang_str_test.py
+++ b/golang/golang_str_test.py
@@ -26,7 +26,7 @@ from golang._golang import _udata, _bdata
 from golang.gcompat import qq
 from golang.strconv_test import byterange
 from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE
-from pytest import raises
+from pytest import raises, mark, skip
 import sys
 from six import text_type as unicode
 from six.moves import range as xrange
@@ -132,8 +132,8 @@ def test_strings_basic():
    us = u(u_);    assert isinstance(us, unicode);  assert type(us) is ustr

    # b/u from bytes
-    _ = b(b_);     assert type(_) is bstr
-    _ = u(b_);     assert type(_) is ustr
+    _ = b(b_);     assert type(_) is bstr;  assert _ == "мир"
+    _ = u(b_);     assert type(_) is ustr;  assert _ == "мир"

    # TODO also handle bytearray?

@@ -147,14 +147,19 @@ def test_strings_basic():
    assert unicode(us) is us

    # unicode(b) -> u,  bytes(u) -> b
-    _ = unicode(bs);  assert type(_) is ustr
-    _ = bytes  (us);  assert type(_) is bstr
+    _ = unicode(bs);  assert type(_) is ustr;  assert _ == "мир"
+    _ = bytes  (us);  assert type(_) is bstr;  assert _ == "мир"

    # b(u(·)), u(b(·))
-    _ = b(us);    assert type(_) is bstr
-    _ = u(bs);    assert type(_) is ustr
-    _ = bstr(us); assert type(_) is bstr
-    _ = ustr(bs); assert type(_) is ustr
+    _ = b(us);    assert type(_) is bstr;  assert _ == "мир"
+    _ = u(bs);    assert type(_) is ustr;  assert _ == "мир"
+    _ = bstr(us); assert type(_) is bstr;  assert _ == "мир"
+    _ = ustr(bs); assert type(_) is ustr;  assert _ == "мир"
+
+    # hash of b/u is made to be equal to hash of current str
+    # (it cannot be equal to hash(b'мир') and hash(u'мир') at the same time as those hashes differ)
+    assert hash(us) == hash("мир");  assert us == "мир"
+    assert hash(bs) == hash("мир");  assert bs == "мир"

    # str
    _ = str(us);   assert isinstance(_, str);  assert _ == "мир"
@@ -167,6 +172,98 @@ def test_strings_basic():
        with raises(AttributeError):
            bs.hello = 1

+
+# verify string operations like `x + y` for all combinations of pairs from
+# bytes, unicode, bstr and ustr. Except if both x and y are std
+# python types, e.g. (bytes, unicode), because those combinations are handled
+# only by builtin python code and might be rejected.
+@mark.parametrize('tx', (bytes, unicode, bstr, ustr))
+@mark.parametrize('ty', (bytes, unicode, bstr, ustr))
+def test_strings_ops2(tx, ty):
+    # skip e.g. regular bytes vs regular unicode
+    tstd = {bytes, unicode}
+    if tx in tstd  and  ty in tstd  and  tx is not ty:
+        skip()
+
+    # == != <= >= < >   for ~equal
+    x = xstr(u'мир', tx);  assert type(x) is tx
+    y = xstr(u'мир', ty);  assert type(y) is ty
+    assert      x == y
+    assert      y == x
+    assert not (x != y)
+    assert not (y != x)
+    assert      x >= y
+    assert      y >= x
+    assert      x <= y
+    assert      y <= x
+    assert not (x > y)
+    assert not (y > x)
+    assert not (x < y)
+    assert not (y < x)
+
+    # now not equal
+    x = xstr(u'hello ', tx)
+    y = xstr(u'мир',    ty)
+
+    # == != <= >= < >
+    assert not (x == y)
+    assert not (y == x)
+    assert      x != y
+    assert      y != x
+    assert not (x >= y)
+    assert      y >= x
+    assert      x <= y
+    assert not (y <= x)
+    assert      x < y
+    assert not (y < x)
+    assert not (x > y)
+    assert      y > x
+
+
+# verify string operations like `x == *` for x being bstr/ustr.
+# Those operations must succeed for any hashable type or else bstr/ustr could
+# not be used as dict keys.
+@mark.parametrize('tx', (bstr, ustr))
+def test_strings_ops2_eq_any(tx):
+    x = xstr(u'мир', tx)
+    while 1:
+        hx = hash(x)
+        if hash(hx) == hx:  # positive int32 will have this property
+            break
+        x += xstr('!', tx)
+
+    # assertNE asserts that (x==y) is False and (x!=y) is True.
+    # it also asserts that e.g. x < y raises TypeError
+    def assertNE(y):
+        assert (x == y) is False
+        assert (x != y) is True
+        with raises(TypeError): x >= y
+        with raises(TypeError): x <= y
+        with raises(TypeError): x >  y
+        with raises(TypeError): x <  y
+    _ = assertNE
+
+    _(None)
+    _(0)
+    _(1)
+    _(2)
+
+    assert hash(x)  == hx
+    assert hash(hx) == hx
+    _(hx)
+    d = {x: 1, hx: 2}    # creating dict will fail if `x == hx` raises TypeError
+    assert d[x]  == 1
+    assert d[hx] == 2
+
+    _(())
+    _((1,))
+    _((x,))
+
+    # == wrt non-hashable type also succeeds following std python where e.g. 's' == [1] gives False
+    l = [1]
+    with raises(TypeError): hash(l)
+    _(l)
+
 # verify print for bstr/ustr.
 def test_strings_print():
    outok = readfile(dir_testprog + "/golang_test_str.txt")
@@ -191,6 +288,30 @@ def test_qq():
    assert u'hello %s !' % qq(b('мир')) ==  u'hello "мир" !'    # u % qq(b) -> u


+# ----------------------------------------
+
+# verify that what we patched stays unaffected when
+# called outside of bstr/ustr context.
+def test_strings_patched_transparently():
+    u_  = xunicode  ("мир");  assert type(u_)  is unicode
+
+    # unicode comparison stay unaffected
+    assert (u_ == u_)  is True
+    assert (u_ != u_)  is False
+    assert (u_ <  u_)  is False
+    assert (u_ >  u_)  is False
+    assert (u_ <= u_)  is True
+    assert (u_ >= u_)  is True
+
+    u2 = xunicode("май");  assert type(u2) is unicode
+    assert (u_ == u2)  is False     ; assert (u2 == u_)  is False
+    assert (u_ != u2)  is True      ; assert (u2 != u_)  is True
+    assert (u_ <  u2)  is False     ; assert (u2 <  u_)  is True
+    assert (u_ >  u2)  is True      ; assert (u2 >  u_)  is False
+    assert (u_ <= u2)  is False     ; assert (u2 <= u_)  is True
+    assert (u_ >= u2)  is True      ; assert (u2 >= u_)  is False
+
+
 # ---- benchmarks ----

 # utf-8 decoding
@@ -224,3 +345,17 @@ def bench_bencode(b):
 # unicode correspondingly to function name.
 def xbytes(x):     return x.encode('utf-8') if type(x) is unicode else x
 def xunicode(x):   return x.decode('utf-8') if type(x) is bytes   else x
+
+# xstr returns string corresponding to specified type and data.
+def xstr(text, typ):
+    def _():
+        t = {
+            bytes:      xbytes,
+            unicode:    xunicode,
+            bstr:       b,
+            ustr:       u,
+        }
+        return t[typ](text)
+    s = _()
+    assert type(s) is typ
+    return s