Commit 781802d4 authored by Kirill Smelkov's avatar Kirill Smelkov

golang_str: Implement bstr/ustr constructors

Both bstr and ustr constructors mimic the constructor of unicode (= str on py3):
an object is either stringified, or - when the constructor is invoked with the
optional encoding and/or errors arguments - decoded via the buffer
interface it provides:

    # py2
    class unicode(basestring)
     |  unicode(object='') -> unicode object
     |  unicode(string[, encoding[, errors]]) -> unicode object

    # py3
    class str(object)
     |  str(object='') -> str
     |  str(bytes_or_buffer[, encoding[, errors]]) -> str

Stringification of bstr/ustr / unicode/bytes is handled automatically
and means conversion to the type being created via b or u.

We follow unicode semantics for both ustr _and_ bstr, because bstr/ustr
are intended to be used as strings.
parent 54c2a3cf
......@@ -233,8 +233,8 @@ Pygolang, similarly to Go, provides uniform UTF8-based approach to strings with
the idea to make working with byte- and unicode- strings easy and transparently
interoperable:
- `bstr` is byte-string: it is based on `bytes` and can automatically convert to `unicode` [*]_.
- `ustr` is unicode-string: it is based on `unicode` and can automatically convert to `bytes`.
- `bstr` is byte-string: it is based on `bytes` and can automatically convert to/from `unicode` [*]_.
- `ustr` is unicode-string: it is based on `unicode` and can automatically convert to/from `bytes`.
The conversion, in both encoding and decoding, never fails and never loses
information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity
......@@ -245,7 +245,9 @@ operations in between `ustr` and `bstr`/`bytes` / `unicode` coerce
to `ustr`. When the coercion happens, `bytes`, similarly to
`bstr`, are also treated as UTF8-encoded strings.
`bstr`/`ustr` constructors will accept arbitrary objects and either convert or stringify them. For
`bstr` and `ustr` are meant to be drop-in replacements for standard
`str`/`unicode` classes. They support all methods of `str`/`unicode` and in
particular their constructors accept arbitrary objects and either convert or stringify them. For
cases when no stringification is desired, and one only wants to convert
`bstr`/`ustr` / `unicode`/`bytes`
to Pygolang string, `b` and `u` provide way to make sure an
......
......@@ -51,17 +51,10 @@ def pyb(s): # -> bstr
See also: u, bstr/ustr.
"""
if type(s) is pybstr:
return s
if isinstance(s, bytes): # py2: str py3: bytes
pass
elif isinstance(s, unicode): # py2: unicode py3: str
s = _utf8_encode_surrogateescape(s)
else:
bs = _pyb(pybstr, s)
if bs is None:
raise TypeError("b: invalid type %s" % type(s))
return pybstr(s)
return bs
def pyu(s): # -> ustr
"""u converts object to ustr.
......@@ -81,17 +74,41 @@ def pyu(s): # -> ustr
See also: b, bstr/ustr.
"""
if type(s) is pyustr:
us = _pyu(pyustr, s)
if us is None:
raise TypeError("u: invalid type %s" % type(s))
return us
cdef _pyb(bcls, s): # -> ~bstr | None
if type(s) is bcls:
return s
if isinstance(s, unicode): # py2: unicode py3: str
pass
elif isinstance(s, bytes): # py2: str py3: bytes
if isinstance(s, bytes):
if type(s) is not bytes:
s = _bdata(s)
elif isinstance(s, unicode):
s = _utf8_encode_surrogateescape(s)
else:
return None
assert type(s) is bytes
return bytes.__new__(bcls, s)
cdef _pyu(ucls, s): # -> ~ustr | None
if type(s) is ucls:
return s
if isinstance(s, unicode):
if type(s) is not unicode:
s = _udata(s)
elif isinstance(s, bytes):
s = _utf8_decode_surrogateescape(s)
else:
raise TypeError("u: invalid type %s" % type(s))
return None
return pyustr(s)
assert type(s) is unicode
return unicode.__new__(ucls, s)
# _pyb_coerce coerces x from `b op x` to be used in operation with pyb.
......@@ -136,7 +153,7 @@ cdef __pystr(object obj): # -> ~str
class pybstr(bytes):
"""bstr is byte-string.
It is based on bytes and can automatically convert to unicode.
It is based on bytes and can automatically convert to/from unicode.
The conversion never fails and never loses information:
bstr → ustr → bstr
......@@ -147,6 +164,15 @@ class pybstr(bytes):
When the coercion happens, bytes, similarly to bstr, are also
treated as UTF8-encoded strings.
bstr constructor accepts arbitrary objects and stringifies them:
- if encoding and/or errors is specified, the object must provide buffer
interface. The data in the buffer is decoded according to provided
encoding/errors and further encoded via UTF-8 into bstr.
- if the object is bstr/ustr / unicode/bytes - it is converted
to bstr. See b for details.
- otherwise bstr will have string representation of the object.
See also: b, ustr/u.
"""
......@@ -154,6 +180,18 @@ class pybstr(bytes):
# won't be needed after switch to -> `cdef class`
__slots__ = ()
def __new__(cls, object='', encoding=None, errors=None):
    # Mimics unicode(object[, encoding[, errors]]) but yields an instance of cls.
    #
    # As soon as encoding and/or errors is given, object is not stringified:
    # it is decoded through its buffer interface by _buffer_decode instead.
    if encoding is not None or errors is not None:
        object = _buffer_decode(object, encoding, errors)

    # everything else (and the decoded text as well) goes through
    # _bstringify, which hands back either unicode or bytes.
    text = _bstringify(object)
    assert isinstance(text, (unicode, bytes)), text

    # _pyb returns None only for input that is neither bytes nor unicode,
    # which the assert above has just ruled out.
    result = _pyb(cls, text)
    assert result is not None
    return result
def __bytes__(self): return self
def __unicode__(self): return pyu(self)
......@@ -191,10 +229,11 @@ class pybstr(bytes):
def __ge__(a, b): return bytes.__ge__(a, _pyb_coerce(b))
cdef class pyustr(unicode):
# XXX cannot `cdef class` with __new__: https://github.com/cython/cython/issues/799
class pyustr(unicode):
"""ustr is unicode-string.
It is based on unicode and can automatically convert to bytes.
It is based on unicode and can automatically convert to/from bytes.
The conversion never fails and never loses information:
ustr → bstr → ustr
......@@ -205,9 +244,29 @@ cdef class pyustr(unicode):
When the coercion happens, bytes, similarly to bstr, are also
treated as UTF8-encoded strings.
ustr constructor, similarly to the one in bstr, accepts arbitrary objects
and stringifies them. Please refer to bstr and u documentation for details.
See also: u, bstr/b.
"""
# don't allow to set arbitrary attributes.
# won't be needed after switch to -> `cdef class`
__slots__ = ()
def __new__(cls, object='', encoding=None, errors=None):
    # Mimics unicode(object[, encoding[, errors]]) but yields an instance of cls.
    #
    # An explicit encoding and/or errors switches from stringification to
    # decoding: object is then read through its buffer interface.
    decode_requested = (encoding is not None) or (errors is not None)
    if decode_requested:
        object = _buffer_decode(object, encoding, errors)

    # _bstringify gives back unicode or bytes for whatever it is handed.
    text = _bstringify(object)
    assert isinstance(text, (unicode, bytes)), text

    # _pyu returns None only for input that is neither unicode nor bytes,
    # which cannot happen after the assert above.
    result = _pyu(cls, text)
    assert result is not None
    return result
def __bytes__(self): return pyb(self)
def __unicode__(self): return self
......@@ -312,6 +371,37 @@ def pyqq(obj):
return qobj
# ---- _bstringify ----
# _bstringify returns string representation of obj.
# it is similar to unicode(obj).
cdef _bstringify(object obj): # -> unicode|bytes
    # objects whose type is exactly one of the string types are returned as is
    if type(obj) in (pybstr, pyustr, bytes, unicode):
        return obj

    # py3: unicode(·) already does what we need
    if PY_MAJOR_VERSION >= 3:
        return unicode(obj)

    # py2: reproduce by hand what unicode(·) does on py3.
    # Plain unicode(obj) would raise UnicodeDecodeError when obj.__str__
    # returns UTF-8 bytestring, and plain str(obj) would fail when
    # obj.__str__ returns unicode - so neither can be used unconditionally.
    if hasattr(obj, '__unicode__'):
        return obj.__unicode__()

    if not hasattr(obj, '__str__'):
        return repr(obj)

    # unicode itself has no .__unicode__ method and (u'β').__str__() raises
    # UnicodeEncodeError. Detect unicode subclasses that inherit __str__
    # unchanged and convert them via unicode(·) instead.
    if type(obj).__str__ is unicode.__str__:
        return unicode(obj)
    return obj.__str__()
# py2: adjust unicode.tp_richcompare(a,b) to return NotImplemented if b is bstr.
# This way we avoid `UnicodeWarning: Unicode equal comparison failed to convert
# both arguments to Unicode - interpreting them as being unequal`, and that
......@@ -381,6 +471,42 @@ cdef class _UnboundMethod(object): # they removed unbound methods on py3
return pyfunctools.partial(self.func, obj)
# ---- misc ----
# _buffer_py2 returns buffer(obj) on py2 / fails on py3
# Wraps obj into py2 `buffer`; the py3 build of this function only raises
# AssertionError, so callers must invoke it on py2 exclusively.
cdef object _buffer_py2(object obj):
# NOTE uppercase IF/ELSE is Cython compile-time conditional: only one of the
# two branches ends up in the generated code, depending on PY2.
IF PY2: # cannot `if PY_MAJOR_VERSION < 3` because then cython errors
return buffer(obj) # "undeclared name not builtin: buffer"
ELSE:
raise AssertionError("must be called only on py2")
# _buffer_decode decodes buf to unicode according to encoding and errors.
#
# buf must expose buffer interface.
# encoding/errors can be None meaning to use default utf-8/strict.
cdef unicode _buffer_decode(buf, encoding, errors):
    # None for encoding/errors selects the defaults.
    # NOTE the default encoding is always UTF-8, not sys.getdefaultencoding
    codec = 'utf-8' if encoding is None else encoding
    onerror = 'strict' if errors is None else errors

    # old-style (py2) buffers are accessed via buffer(·), everything else
    # via memoryview(·)
    view = _buffer_py2(buf) if _XPyObject_CheckOldBuffer(buf) else memoryview(buf)

    # copy the data into bytearray and let it do the actual decoding
    return bytearray(view).decode(codec, onerror)
# _XPyObject_CheckOldBuffer reports whether o exposes the old-style (py2)
# buffer interface. The helper is defined by the C snippet given in the
# string below: on py3 it always returns 0, on py2 it returns the result of
# PyObject_CheckReadBuffer(o).
cdef extern from "Python.h":
"""
static int _XPyObject_CheckOldBuffer(PyObject *o) {
#if PY_MAJOR_VERSION >= 3
// no old-style buffers on py3
return 0;
#else
return PyObject_CheckReadBuffer(o);
#endif
}
"""
# Cython-level declaration of the C helper defined above
bint _XPyObject_CheckOldBuffer(object o)
# ---- UTF-8 encode/decode ----
from six import unichr # py2: unichr py3: chr
......@@ -472,7 +598,7 @@ def _utf8_decode_surrogateescape(const uint8_t[::1] s): # -> unicode
def _utf8_encode_surrogateescape(s): # -> bytes
assert isinstance(s, unicode)
if PY_MAJOR_VERSION >= 3:
return s.encode('UTF-8', 'surrogateescape')
return unicode.encode(s, 'UTF-8', 'surrogateescape')
# py2 does not have surrogateescape error handler, and even if we
# provide one, builtin unicode.encode() does not treat
......
......@@ -28,8 +28,20 @@ from golang.strconv_test import byterange
from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE
from pytest import raises, mark, skip
import sys
import six
from six import text_type as unicode
from six.moves import range as xrange
import array
# buftypes lists types with buffer interface that we will test against.
buftypes = [bytearray, memoryview]
# array.array needs a typecode, so wrap it to get a one-argument constructor
buftypes.append(lambda x: array.array('B', x))
if six.PY2:
    buftypes.append(buffer) # no buffer on py3
# verify b/u and bstr/ustr basics.
......@@ -123,6 +135,17 @@ def test_strings_basic():
with raises(TypeError): b(object())
with raises(TypeError): u(object())
# bstr/ustr - similarly to str - accept arbitrary objects
_ = bstr(); assert type(_) is bstr; assert _ == ''
_ = ustr(); assert type(_) is ustr; assert _ == ''
_ = bstr(123); assert type(_) is bstr; assert _ == '123'
_ = ustr(123); assert type(_) is ustr; assert _ == '123'
_ = bstr([1,'b']); assert type(_) is bstr; assert _ == "[1, 'b']"
_ = ustr([1,'b']); assert type(_) is ustr; assert _ == "[1, 'b']"
obj = object()
_ = bstr(obj); assert type(_) is bstr; assert _ == str(obj) # <object ...>
_ = ustr(obj); assert type(_) is ustr; assert _ == str(obj) # <object ...>
b_ = xbytes ("мир"); assert type(b_) is bytes
u_ = xunicode ("мир"); assert type(u_) is unicode
......@@ -130,17 +153,46 @@ def test_strings_basic():
# b/u from unicode
bs = b(u_); assert isinstance(bs, bytes); assert type(bs) is bstr
us = u(u_); assert isinstance(us, unicode); assert type(us) is ustr
_ = bstr(u_); assert type(_) is bstr; assert _ == "мир"
_ = ustr(u_); assert type(_) is ustr; assert _ == "мир"
# b/u from bytes
_ = b(b_); assert type(_) is bstr; assert _ == "мир"
_ = u(b_); assert type(_) is ustr; assert _ == "мир"
_ = bstr(b_); assert type(_) is bstr; assert _ == "мир"
_ = ustr(b_); assert type(_) is ustr; assert _ == "мир"
# TODO also handle bytearray?
# bstr/ustr from bytes/buffer with encoding
k8mir_bytes = u"мир".encode('koi8-r')
for tbuf in [bytes] + buftypes:
k8mir = tbuf(k8mir_bytes)
_ = bstr(k8mir, 'koi8-r'); assert type(_) is bstr; assert _ == "мир"
_ = ustr(k8mir, 'koi8-r'); assert type(_) is ustr; assert _ == "мир"
with raises(UnicodeDecodeError): bstr(k8mir, 'ascii')
with raises(UnicodeDecodeError): ustr(k8mir, 'ascii')
_ = bstr(k8mir, 'ascii', 'replace'); assert type(_) is bstr; assert _ == u'\ufffd\ufffd\ufffd'
_ = ustr(k8mir, 'ascii', 'replace'); assert type(_) is ustr; assert _ == u'\ufffd\ufffd\ufffd'
# no encoding -> utf8 with surrogateescape for bytes, stringify for the rest
k8mir_usurrogateescape = u'\udccd\udcc9\udcd2'
k8mir_strok = k8mir_usurrogateescape
if not tbuf in (bytes,):
k8mir_strok = str(k8mir) # e.g. '<memory at ...>' for memoryview
_ = bstr(k8mir); assert type(_) is bstr; assert _ == k8mir_strok
_ = ustr(k8mir); assert type(_) is ustr; assert _ == k8mir_strok
# encoding specified -> treat it precisely
with raises(UnicodeDecodeError): bstr(k8mir, 'utf-8')
with raises(UnicodeDecodeError): ustr(k8mir, 'utf-8')
with raises(UnicodeDecodeError): bstr(k8mir, encoding='utf-8')
with raises(UnicodeDecodeError): ustr(k8mir, encoding='utf-8')
with raises(UnicodeDecodeError): bstr(k8mir, errors='strict')
with raises(UnicodeDecodeError): ustr(k8mir, errors='strict')
# b(b(·)) = identity, u(u(·)) = identity
assert b(bs) is bs
assert u(us) is us
assert b(bs) is bs; assert bstr(bs) is bs
assert u(us) is us; assert ustr(us) is us
# bytes(b(·)) = identity, unicode(u(·)) = identity
assert bytes (bs) is bs
......@@ -274,6 +326,44 @@ def test_strings_print():
assertDoc(outok, stdout)
# verify behaviour of bstr|ustr subclasses.
# The test is run once for each base string type tx: unicode, bstr and ustr.
@mark.parametrize('tx', (unicode, bstr, ustr))
def test_strings_subclasses(tx):
# x is the reference value of type tx
# (xstr is defined outside of this block - presumably it converts u'мир' to tx)
x = xstr(u'мир', tx); assert type(x) is tx
# subclass without __str__
class MyStr(tx):
pass
xx = MyStr(x); assert type(xx) is MyStr
# both constructors and b/u must return the base types with the original data
_ = tx(xx); assert type(_) is tx ; assert _ == x # e.g. unicode(MyStr) -> unicode, not MyStr
_ = bstr(xx); assert type(_) is bstr ; assert _ == 'мир'
_ = ustr(xx); assert type(_) is ustr ; assert _ == 'мир'
_ = b(xx); assert type(_) is bstr ; assert _ == 'мир'
_ = u(xx); assert type(_) is ustr ; assert _ == 'мир'
# subclass with __str__
# NOTE MyStr is intentionally redefined here; __unicode__ aliases __str__
class MyStr(tx):
def __str__(self): return u'αβγ'
__unicode__ = __str__
xx = MyStr(x); assert type(xx) is MyStr
# constructors stringify and thus see the overridden __str__ ...
_ = tx(xx); assert type(_) is tx ; assert _ == u'αβγ' # unicode(MyStr) -> u'αβγ', not 'мир'
_ = bstr(xx); assert type(_) is bstr ; assert _ == u'αβγ'
_ = ustr(xx); assert type(_) is ustr ; assert _ == u'αβγ'
# ... while b/u only convert and thus see the underlying data
_ = b(xx); assert type(_) is bstr ; assert _ == u'мир' # b(MyStr) -> 'мир', not 'αβγ'
_ = u(xx); assert type(_) is ustr ; assert _ == u'мир'
# non-subclass with __str__ (for completeness)
class MyObj(object):
def __str__(self):
return 'myobj'
xx = MyObj(); assert type(xx) is MyObj
_ = tx(xx); assert type(_) is tx ; assert _ == 'myobj'
_ = bstr(xx); assert type(_) is bstr ; assert _ == 'myobj'
_ = ustr(xx); assert type(_) is ustr ; assert _ == 'myobj'
# b/u do not stringify arbitrary objects - they reject them with TypeError
with raises(TypeError): b(xx) # NOTE b/u reports "conversion failure"
with raises(TypeError): u(xx)
def test_qq():
# NOTE qq is also tested as part of strconv.quote
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment