Commit 54c2a3cf authored by Kirill Smelkov's avatar Kirill Smelkov

golang_str: Teach bstr/ustr to compare wrt any string with automatic coercion

So that e.g. `bstr == <any string type>` works. We want `bstr == ustr`
to work because we intend those types to be interoperable. We also want
e.g. `bstr == "a_string"` to work because we want bstr to be
interoperable with standard strings. In general we want to have full
automatic interoperability with all string types, so that e.g. `bstr == X`
works for X being any of bstr, ustr, unicode, bytes (and later bytearray).

For now we add support only for comparison operators. But later, we
will be adding support for e.g. +, string methods, etc - and in all
those operations we will be following the same approach: to have
automatic interoperability with all string types out of the box.

The text added to README reflects this.

The patch to unicode.tp_richcompare on py2 illustrates our approach of
adjusting builtin types only when absolutely needed. In this particular case
the original builtin unicode.__eq__(unicode, bstr) always returns False
for non-ASCII bstr, even though bstr has a .__unicode__() method. Our
adjustment is non-intrusive - we adjust unicode behaviour only wrt bstr
and it stays exactly the same as before wrt all other types.

Nevertheless, we do that with care and add a test verifying that the
behaviour of what we patched stays unaffected when used outside of the
bstr/ustr context.
parent 34667355
......@@ -240,6 +240,11 @@ The conversion, in both encoding and decoding, never fails and never loses
information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity
even if bytes data is not valid UTF-8.
Operations in between `bstr` and `ustr`/`unicode` / `bytes` coerce to `bstr`, while
operations in between `ustr` and `bstr`/`bytes` / `unicode` coerce
to `ustr`. When the coercion happens, `bytes`, similarly to
`bstr`, are also treated as UTF8-encoded strings.
`bstr`/`ustr` constructors will accept arbitrary objects and either convert or stringify them. For
cases when no stringification is desired, and one only wants to convert
`bstr`/`ustr` / `unicode`/`bytes`
......
......@@ -24,7 +24,8 @@ It is included from _golang.pyx .
from cpython cimport PyUnicode_AsUnicode, PyUnicode_GetSize, PyUnicode_FromUnicode
from cpython cimport PyUnicode_DecodeUTF8
from cpython cimport PyTypeObject, Py_TYPE
from cpython cimport PyTypeObject, Py_TYPE, richcmpfunc
from cpython cimport Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE
cdef extern from "Python.h":
void PyType_Modified(PyTypeObject *)
......@@ -93,6 +94,25 @@ def pyu(s): # -> ustr
return pyustr(s)
# _pyb_coerce prepares x, the right-hand operand of `b op x`, so that it can
# take part in an operation with bstr:
#
#   - bytes instances are passed through unchanged;
#   - unicode instances are converted via pyb;
#   - anything else is rejected with TypeError.
cdef _pyb_coerce(x):    # -> bstr|bytes
    if isinstance(x, bytes):
        return x
    if isinstance(x, unicode):
        return pyb(x)
    raise TypeError("b: coerce: invalid type %s" % type(x))
# _pyu_coerce prepares x, the right-hand operand of `u op x`, so that it can
# take part in an operation with ustr:
#
#   - unicode instances are passed through unchanged;
#   - bytes instances are converted via pyu;
#   - anything else is rejected with TypeError.
cdef _pyu_coerce(x):    # -> ustr|unicode
    if isinstance(x, unicode):
        return x
    if isinstance(x, bytes):
        return pyu(x)
    raise TypeError("u: coerce: invalid type %s" % type(x))
# __pystr converts obj to ~str of current python:
#
# - to ~bytes, via b, if running on py2, or
......@@ -123,6 +143,10 @@ class pybstr(bytes):
is always identity even if bytes data is not valid UTF-8.
Operations in between bstr and ustr/unicode / bytes coerce to bstr.
When the coercion happens, bytes, similarly to bstr, are also
treated as UTF8-encoded strings.
See also: b, ustr/u.
"""
......@@ -141,6 +165,32 @@ class pybstr(bytes):
return self
def __hash__(self):
    # A unicode string and its UTF-8 encoded bytes generally hash differently,
    # so hash(bstr) cannot match hash(bytes) and hash(unicode) simultaneously.
    # -> follow the str type of the running python, so that bstr and native
    #    str can be used interchangeably as dictionary keys.
    if PY_MAJOR_VERSION < 3:
        # py2: native str is bytes
        return bytes.__hash__(self)
    # py3: native str is unicode
    return hash(pyu(self))
# == != < > <= >=
# NOTE == and != are special: they must never raise, whatever the type of
# the other operand is, or else bstr could not be used as dict key.
def __eq__(a, b):
    try:
        other = _pyb_coerce(b)
    except TypeError:
        # b is not a string at all -> it cannot be equal to a
        return False
    else:
        return bytes.__eq__(a, other)
def __ne__(a, b):
    # defined through __eq__, hence also succeeds against any type
    return not a.__eq__(b)

# ordering: the other operand is coerced first; the TypeError that
# _pyb_coerce raises for non-string operands is propagated to the caller.
def __lt__(a, b):
    return bytes.__lt__(a, _pyb_coerce(b))

def __gt__(a, b):
    return bytes.__gt__(a, _pyb_coerce(b))

def __le__(a, b):
    return bytes.__le__(a, _pyb_coerce(b))

def __ge__(a, b):
    return bytes.__ge__(a, _pyb_coerce(b))
cdef class pyustr(unicode):
"""ustr is unicode-string.
......@@ -151,6 +201,10 @@ cdef class pyustr(unicode):
is always identity even if bytes data is not valid UTF-8.
Operations in between ustr and bstr/bytes / unicode coerce to ustr.
When the coercion happens, bytes, similarly to bstr, are also
treated as UTF8-encoded strings.
See also: u, bstr/b.
"""
......@@ -164,6 +218,29 @@ cdef class pyustr(unicode):
return pyb(self)
def __hash__(self):
    # same policy as pybstr.__hash__: hash like the str type of the
    # running python.
    if PY_MAJOR_VERSION < 3:
        # py2: native str is bytes
        return hash(pyb(self))
    # py3: native str is unicode
    return unicode.__hash__(self)
# == != < > <= >=
# NOTE == and != are special: they must never raise, whatever the type of
# the other operand is, or else ustr could not be used as dict key.
def __eq__(a, b):
    try:
        other = _pyu_coerce(b)
    except TypeError:
        # b is not a string at all -> it cannot be equal to a
        return False
    else:
        return unicode.__eq__(a, other)
def __ne__(a, b):
    # defined through __eq__, hence also succeeds against any type
    return not a.__eq__(b)

# ordering: the other operand is coerced first; the TypeError that
# _pyu_coerce raises for non-string operands is propagated to the caller.
def __lt__(a, b):
    return unicode.__lt__(a, _pyu_coerce(b))

def __gt__(a, b):
    return unicode.__gt__(a, _pyu_coerce(b))

def __le__(a, b):
    return unicode.__le__(a, _pyu_coerce(b))

def __ge__(a, b):
    return unicode.__ge__(a, _pyu_coerce(b))
# _bdata/_udata retrieve raw data from bytes/unicode.
def _bdata(obj): # -> bytes
assert isinstance(obj, bytes)
......@@ -235,6 +312,44 @@ def pyqq(obj):
return qobj
# py2: adjust unicode.tp_richcompare(a,b) to return NotImplemented if b is bstr.
# This way we avoid `UnicodeWarning: Unicode equal comparison failed to convert
# both arguments to Unicode - interpreting them as being unequal`, and the
# resulting situation where `a == b` returns False even though `b == a` gives True.
#
# NOTE there is no need to do the same for ustr, because ustr inherits from
# unicode and can be always natively converted to unicode by python itself.
#
# _unicode_tp_richcompare remembers the rich-comparison slot taken from the
# type of u'' at this point, so that the adjusted comparison can delegate to it.
cdef richcmpfunc _unicode_tp_richcompare = Py_TYPE(u'').tp_richcompare
# _unicode_tp_xrichcompare is the adjusted rich comparison for unicode:
# it declines (NotImplemented) when the other operand is bstr and otherwise
# delegates to the saved _unicode_tp_richcompare unchanged.
cdef object _unicode_tp_xrichcompare(object a, object b, int op):
    if not isinstance(b, pybstr):
        return _unicode_tp_richcompare(a, b, op)
    return NotImplemented
# _unicode_x__<op>__ are the per-operator entry points that get installed as
# __eq__, __ne__, __lt__, __gt__, __le__ and __ge__ via _patch_slot.
#
# They go through _unicode_tp_xrichcompare - not through the saved original
# _unicode_tp_richcompare - so that an explicit call such as `u.__eq__(b)`
# agrees with what `u == b` does via the adjusted tp_richcompare slot, i.e.
# returns NotImplemented when b is bstr and behaves as before otherwise.
cdef object _unicode_x__eq__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_EQ)
cdef object _unicode_x__ne__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_NE)
cdef object _unicode_x__lt__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_LT)
cdef object _unicode_x__gt__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_GT)
cdef object _unicode_x__le__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_LE)
cdef object _unicode_x__ge__(object a, object b):   return _unicode_tp_xrichcompare(a, b, Py_GE)
# The adjustment is applied on py2 only. The work lives in a throw-away
# function `_` that is invoked right after it is defined.
if PY_MAJOR_VERSION < 3:
def _():
cdef PyTypeObject* t
# consider unicode itself plus the subclasses reported by unicode.__subclasses__()
for pyt in [unicode] + unicode.__subclasses__():
assert isinstance(pyt, type)
t = <PyTypeObject*>pyt
# switch only those types whose comparison slot is still the original
# unicode one; types with their own tp_richcompare are left alone.
if t.tp_richcompare == _unicode_tp_richcompare:
t.tp_richcompare = _unicode_tp_xrichcompare
# install the matching per-operator entries under their __op__ names.
# NOTE(review): indentation is lost in this view - confirm which of the
# statements above these _patch_slot calls are nested under.
_patch_slot(t, "__eq__", _unicode_x__eq__)
_patch_slot(t, "__ne__", _unicode_x__ne__)
_patch_slot(t, "__lt__", _unicode_x__lt__)
_patch_slot(t, "__gt__", _unicode_x__gt__)
_patch_slot(t, "__le__", _unicode_x__le__)
_patch_slot(t, "__ge__", _unicode_x__ge__)
_()
# _patch_slot installs func_or_descr into typ's __dict__ as name.
#
# if func_or_descr is descriptor (has __get__), it is installed as is.
......
......@@ -26,7 +26,7 @@ from golang._golang import _udata, _bdata
from golang.gcompat import qq
from golang.strconv_test import byterange
from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE
from pytest import raises
from pytest import raises, mark, skip
import sys
from six import text_type as unicode
from six.moves import range as xrange
......@@ -132,8 +132,8 @@ def test_strings_basic():
us = u(u_); assert isinstance(us, unicode); assert type(us) is ustr
# b/u from bytes
_ = b(b_); assert type(_) is bstr
_ = u(b_); assert type(_) is ustr
_ = b(b_); assert type(_) is bstr; assert _ == "мир"
_ = u(b_); assert type(_) is ustr; assert _ == "мир"
# TODO also handle bytearray?
......@@ -147,14 +147,19 @@ def test_strings_basic():
assert unicode(us) is us
# unicode(b) -> u, bytes(u) -> b
_ = unicode(bs); assert type(_) is ustr
_ = bytes (us); assert type(_) is bstr
_ = unicode(bs); assert type(_) is ustr; assert _ == "мир"
_ = bytes (us); assert type(_) is bstr; assert _ == "мир"
# b(u(·)), u(b(·))
_ = b(us); assert type(_) is bstr
_ = u(bs); assert type(_) is ustr
_ = bstr(us); assert type(_) is bstr
_ = ustr(bs); assert type(_) is ustr
_ = b(us); assert type(_) is bstr; assert _ == "мир"
_ = u(bs); assert type(_) is ustr; assert _ == "мир"
_ = bstr(us); assert type(_) is bstr; assert _ == "мир"
_ = ustr(bs); assert type(_) is ustr; assert _ == "мир"
# hash of b/u is made to be equal to hash of current str
# (it cannot be equal to hash(b'мир') and hash(u'мир') at the same time as those hashes differ)
assert hash(us) == hash("мир"); assert us == "мир"
assert hash(bs) == hash("мир"); assert bs == "мир"
# str
_ = str(us); assert isinstance(_, str); assert _ == "мир"
......@@ -167,6 +172,98 @@ def test_strings_basic():
with raises(AttributeError):
bs.hello = 1
# verify comparison operators (== != <= >= < >) for all combinations of
# operand types taken from bytes, unicode, bstr and ustr.
#
# Pairs where x and y are two different std python types, e.g.
# (bytes, unicode), are skipped: those are handled only by builtin python
# code and might be rejected.
@mark.parametrize('tx', (bytes, unicode, bstr, ustr))
@mark.parametrize('ty', (bytes, unicode, bstr, ustr))
def test_strings_ops2(tx, ty):
    # skip e.g. regular bytes vs regular unicode
    std_types = {bytes, unicode}
    if tx is not ty and tx in std_types and ty in std_types:
        skip()

    # ---- operands with the same content ----
    x = xstr(u'мир', tx)
    assert type(x) is tx
    y = xstr(u'мир', ty)
    assert type(y) is ty

    # == !=
    assert x == y
    assert y == x
    assert not (x != y)
    assert not (y != x)
    # >= <=
    assert x >= y
    assert y >= x
    assert x <= y
    assert y <= x
    # > <
    assert not (x > y)
    assert not (y > x)
    assert not (x < y)
    assert not (y < x)

    # ---- operands with different content, x ordered before y ----
    x = xstr(u'hello ', tx)
    y = xstr(u'мир', ty)

    # == !=
    assert not (x == y)
    assert not (y == x)
    assert x != y
    assert y != x
    # >= <=
    assert not (x >= y)
    assert y >= x
    assert x <= y
    assert not (y <= x)
    # < >
    assert x < y
    assert not (y < x)
    assert not (x > y)
    assert y > x
# verify `x == *` and `x != *` for x being bstr/ustr.
# Those operations must succeed for any hashable type or else bstr/ustr could
# not be used as dict keys.
@mark.parametrize('tx', (bstr, ustr))
def test_strings_ops2_eq_any(tx):
    # Find x whose hash hx is an integer that hashes to itself, so that x
    # and hx land on the same hash and the dict below has to compare them.
    #
    # x is rebuilt through xstr on every attempt instead of being extended
    # with `x += ...`: this keeps the test independent of whether `+` on
    # bstr/ustr preserves the type, and xstr itself asserts that what it
    # returns is exactly of type tx.
    text = u'мир'
    x = xstr(text, tx)
    while 1:
        hx = hash(x)
        if hash(hx) == hx:  # positive int32 will have this property
            break
        text += u'!'
        x = xstr(text, tx)

    # assertNE asserts that (x==y) is False and (x!=y) is True.
    # it also asserts that ordering comparisons, e.g. x < y, raise TypeError.
    def assertNE(y):
        assert (x == y) is False
        assert (x != y) is True
        with raises(TypeError): x >= y
        with raises(TypeError): x <= y
        with raises(TypeError): x >  y
        with raises(TypeError): x <  y

    for y in (None, 0, 1, 2):
        assertNE(y)

    # an integer with the same hash as x is still not equal to x
    assert hash(x)  == hx
    assert hash(hx) == hx
    assertNE(hx)
    d = {x: 1, hx: 2}   # creating dict will fail if `x == hx` raises TypeError
    assert d[x]  == 1
    assert d[hx] == 2

    # tuples, including one that contains x itself
    assertNE(())
    assertNE((1,))
    assertNE((x,))

    # == wrt non-hashable type also succeeds following std python where e.g. 's' == [1] gives False
    l = [1]
    with raises(TypeError): hash(l)
    assertNE(l)
# verify print for bstr/ustr.
def test_strings_print():
outok = readfile(dir_testprog + "/golang_test_str.txt")
......@@ -191,6 +288,30 @@ def test_qq():
assert u'hello %s !' % qq(b('мир')) == u'hello "мир" !' # u % qq(b) -> u
# ----------------------------------------
# verify that what we patched stays unaffected when
# called outside of bstr/ustr context.
def test_strings_patched_transparently():
    u_ = xunicode ("мир")
    assert type(u_) is unicode

    # unicode compared with itself: results stay plain booleans as usual
    assert (u_ == u_) is True
    assert (u_ != u_) is False
    assert (u_ <  u_) is False
    assert (u_ >  u_) is False
    assert (u_ <= u_) is True
    assert (u_ >= u_) is True

    # unicode compared with a different unicode, in both directions
    u2 = xunicode("май")
    assert type(u2) is unicode

    assert (u_ == u2) is False
    assert (u2 == u_) is False

    assert (u_ != u2) is True
    assert (u2 != u_) is True

    assert (u_ <  u2) is False
    assert (u2 <  u_) is True

    assert (u_ >  u2) is True
    assert (u2 >  u_) is False

    assert (u_ <= u2) is False
    assert (u2 <= u_) is True

    assert (u_ >= u2) is True
    assert (u2 >= u_) is False
# ---- benchmarks ----
# utf-8 decoding
......@@ -224,3 +345,17 @@ def bench_bencode(b):
# unicode correspondingly to function name.
def xbytes(x):
    # only objects whose type is exactly unicode are encoded;
    # everything else is returned as is.
    if type(x) is unicode:
        return x.encode('utf-8')
    return x
def xunicode(x):
    # only objects whose type is exactly bytes are decoded;
    # everything else is returned as is.
    if type(x) is bytes:
        return x.decode('utf-8')
    return x
# xstr builds a string of the requested type typ from text.
#
# typ must be one of bytes, unicode, bstr or ustr (KeyError otherwise);
# the result is checked to be exactly of that type.
def xstr(text, typ):
    converters = {
        bytes:      xbytes,
        unicode:    xunicode,
        bstr:       b,
        ustr:       u,
    }
    convert = converters[typ]
    s = convert(text)
    assert type(s) is typ
    return s
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment