Commit 2bb971ba authored by Kirill Smelkov

X golang_str: Adjust bstr/ustr .encode() and .__bytes__ to leave string domain into bytes

Initially I implemented things in such a way that (b|u)str.__bytes__
were giving bstr and ustr.encode() was giving bstr as well. My logic
here was that bstr is based on bytes and it is ok to give that.

However this logic did not pass backward compatibility test: for example
when LXML is imported it does

    cdef bytes _FILENAME_ENCODING = (sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii').encode("UTF-8")

and under gpython it breaks with

      File "/srv/slapgrid/slappart47/srv/runner/software/7f1663e8148f227ce3c6a38fc52796e2/bin/runwsgi", line 4, in <module>
        from Products.ERP5.bin.zopewsgi import runwsgi; sys.exit(runwsgi())
      File "/srv/slapgrid/slappart47/srv/runner/software/7f1663e8148f227ce3c6a38fc52796e2/parts/erp5/product/ERP5/__init__.py", line 36, in <module>
        from Products.ERP5Type.Utils import initializeProduct, updateGlobals
      File "/srv/slapgrid/slappart47/srv/runner/software/7f1663e8148f227ce3c6a38fc52796e2/parts/erp5/product/ERP5Type/__init__.py", line 42, in <module>
        from .patches import pylint
      File "/srv/slapgrid/slappart47/srv/runner/software/7f1663e8148f227ce3c6a38fc52796e2/parts/erp5/product/ERP5Type/patches/pylint.py", line 524, in <module>
        __import__(module_name, fromlist=[module_name], level=0))
      File "src/lxml/sax.py", line 18, in init lxml.sax
      File "src/lxml/etree.pyx", line 154, in init lxml.etree
    TypeError: Expected bytes, got golang.bstr

The breakage highlights a flaw in my previous reasoning: yes, bstr is based on
bytes, but bstr has different semantics compared to bytes. For example, bstr's
__getitem__ works the same way as bytes.__getitem__ on py2, but differently
from bytes.__getitem__ on py3. So if a program on py3 does bytes(x) or
x.encode(), it expects the result to have the bytes semantics of the current
python, which is not the case if the result is bstr.

-> Fix that by adjusting .encode() and .__bytes__() to produce bytes type of
   current python and leave string domain.

For some time I was considering introducing a third type, e.g. bvec, also
based on bytes but with bytes semantics, whose bvec.decode would return back
into the pygolang string domain. But because bytes semantics differ between
py2 and py3, a bvec provided by pygolang would need to behave differently
depending on the current python version, which is undesirable.

In the end, by leaving into native bytes, the "bytes inconsistency" problem
remains the responsibility of std python, while pygolang aims only at fixing
the strings inconsistency between py2 and py3 and at providing the same
semantics for bstr and ustr on all python versions.

It also does not harm that bytes.decode() returns std unicode instead of str:
for programs that run under unpatched python we have u() to convert the result
to ustr, while under gpython std unicode is actually ustr which makes
bytes.decode() behaviour still quite ok.

P.S. we enable bstr.encode for consistency and because under py2, if not
enabled, it will break when running pytest under gpython in

          File ".../_pytest/assertion/rewrite.py", line 352, in <module>
            RN = "\r\n".encode("utf-8")
        AttributeError: unreadable attribute
parent 28c353f8
......@@ -106,6 +106,7 @@ from cython cimport no_gc
from libc.stdio cimport FILE
from golang cimport strconv
import codecs as pycodecs
import string as pystring
import types as pytypes
import functools as pyfunctools
......@@ -343,9 +344,12 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711
# _pybstr.__new__ is hand-made in _pybstr_tp_new which invokes ↑ _pybstr__new__() .
def __bytes__(self): return pyb(self) # see __str__
def __unicode__(self): return pyu(self)
# __bytes__ converts string to bytes leaving string domain.
# NOTE __bytes__ and encode are the only operations that leave string domain.
# NOTE __bytes__ is used only by py3 and only for `bytes(obj)` and `b'%s/%b' % obj`.
def __bytes__(self): return _bdata(self) # -> bytes
def __unicode__(self): return pyu(self)
def __str__(self):
if PY_MAJOR_VERSION >= 3:
return pyu(self)
......@@ -482,13 +486,32 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711
# encode/decode
def decode(self, encoding=None, errors=None):
if encoding is None and errors is None:
encoding = 'utf-8' # NOTE always UTF-8, not sys.getdefaultencoding
errors = 'surrogateescape'
else:
if encoding is None: encoding = 'utf-8'
if errors is None: errors = 'strict'
#
# Encoding strings - both bstr and ustr - convert type to bytes leaving string domain.
#
# Encode treats bstr and ustr as string, encoding unicode representation of
# the string to bytes. For bstr it means that the string representation is
# first converted to unicode and encoded to bytes from there. For ustr
# unicode representation of the string is directly encoded.
#
# Decoding strings is not provided. However for bstr the decode is provided
# treating input data as raw bytes and producing ustr as the result.
#
# NOTE __bytes__ and encode are the only operations that leave string domain.
# encode returns bytes of current python, not bstr, so the result leaves string domain.
def encode(self, encoding=None, errors=None): # -> bytes
# encoding/errors left as None are replaced with defaults
encoding, errors = _encoding_with_defaults(encoding, errors)
# on py2 e.g. bytes.encode('string-escape') works on bytes directly
if PY_MAJOR_VERSION < 3:
codec = pycodecs.lookup(encoding)
# codecs that are not marked as text encodings (e.g. 'hex') are applied
# to the raw data as-is, without going through unicode first
if not codec._is_text_encoding or \
encoding in ('string-escape',): # string-escape also works on bytes
return codec.encode(self, errors)[0] # [0]: codec.encode returns (output, length consumed)
# text encoding: bstr -> ustr -> encode
return pyu(self).encode(encoding, errors)
def decode(self, encoding=None, errors=None): # -> ustr | bstr on py2 for encodings like string-escape
encoding, errors = _encoding_with_defaults(encoding, errors)
if encoding == 'utf-8' and errors == 'surrogateescape':
x = _utf8_decode_surrogateescape(self)
......@@ -499,11 +522,6 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711
return pyb(x)
return pyu(x)
if PY_MAJOR_VERSION < 3:
# whiteout encode inherited from bytes
# TODO ideally whiteout it in such a way that bstr.encode also raises AttributeError
encode = property(doc='bstr has no encode')
# all other string methods
......@@ -667,9 +685,11 @@ cdef class _pyustr(unicode):
# _pyustr.__new__ is hand-made in _pyustr_tp_new which invokes ↑ _pyustr__new__() .
def __bytes__(self): return pyb(self)
def __unicode__(self): return pyu(self) # see __str__
# __bytes__ converts string to bytes leaving string domain.
# see bstr.__bytes__ for more details.
def __bytes__(self): return _bdata(pyb(self)) # -> bytes
def __unicode__(self): return pyu(self) # see __str__
def __str__(self):
if PY_MAJOR_VERSION >= 3:
return pyu(self) # = self or pyustr if it was subclass
......@@ -793,20 +813,15 @@ cdef class _pyustr(unicode):
return pyu(zunicode.__format__(self, format_spec))
# encode/decode
def encode(self, encoding=None, errors=None):
if encoding is None and errors is None:
encoding = 'utf-8' # NOTE always UTF-8, not sys.getdefaultencoding
errors = 'surrogateescape'
else:
if encoding is None: encoding = 'utf-8'
if errors is None: errors = 'strict'
# encode/decode (see bstr for details)
def encode(self, encoding=None, errors=None): # -> bytes
encoding, errors = _encoding_with_defaults(encoding, errors)
if encoding == 'utf-8' and errors == 'surrogateescape':
x = _utf8_encode_surrogateescape(self)
else:
x = zunicode.encode(self, encoding, errors)
return pyb(x)
return x
if PY_MAJOR_VERSION < 3:
# whiteout decode inherited from unicode
......@@ -1987,6 +2002,18 @@ cdef extern from "Python.h":
# ---- UTF-8 encode/decode ----
# _encoding_with_defaults returns encoding and errors substituted with defaults
# as needed for functions like ustr.encode and bstr.decode .
#
# If both encoding and errors are None the result is ('utf-8', 'surrogateescape').
# If only one of them is None, a missing encoding becomes 'utf-8' and a missing
# errors becomes 'strict'; the explicitly given value is returned unchanged.
cdef _encoding_with_defaults(encoding, errors): # -> (encoding, errors)
# nothing was specified -> UTF-8 with surrogateescape
if encoding is None and errors is None:
encoding = 'utf-8' # NOTE always UTF-8, not sys.getdefaultencoding
errors = 'surrogateescape'
else:
# at least one argument was given explicitly -> fill in only what is missing
if encoding is None: encoding = 'utf-8'
if errors is None: errors = 'strict'
return (encoding, errors)
# TODO(kirr) adjust UTF-8 encode/decode surrogateescape(*) a bit so that not
# only bytes -> unicode -> bytes is always identity for any bytes (this is
# already true), but also that unicode -> bytes -> unicode is also always true
......@@ -2238,7 +2265,6 @@ cdef _patch_str():
# XXX explain
bpreserve_slots = upreserve_slots = ("maketrans",)
if PY_MAJOR_VERSION < 3:
bpreserve_slots += ("encode",) # @property'ies
upreserve_slots += ("decode",)
# patch unicode to be pyustr. This patches
......
......@@ -231,13 +231,15 @@ def test_strings_basic():
assert b(bs) is bs; assert bstr(bs) is bs
assert u(us) is us; assert ustr(us) is us
# bytes(b(·)) = identity, unicode(u(·)) = identity
assert bytes (bs) is bs
# unicode(u(·)) = identity
assert unicode(us) is us
# unicode(b) -> u, bytes(u) -> b
# unicode(b) -> u
_ = unicode(bs); assert type(_) is ustr; assert _ == "мир"
_ = bytes (us); assert type(_) is bstr; assert _ == "мир"
# bytes(b|u) -> bytes
_ = bytes(bs); assert type(_) is x32(bytes, bstr); assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80'
_ = bytes(us); assert type(_) is x32(bytes, bstr); assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80'
# bytearray(b|u) -> bytearray
_ = bytearray(bs); assert type(_) is bytearray; assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80'
......@@ -651,14 +653,13 @@ def test_strings_encodedecode():
us = u('мир')
bs = b('май')
_ = us.encode(); assert type(_) is bytes; assert _ == xbytes('мир')
_ = us.encode('utf-8'); assert type(_) is bytes; assert _ == xbytes('мир')
_ = bs.encode(); assert type(_) is bytes; assert _ == xbytes('май')
_ = bs.encode('utf-8'); assert type(_) is bytes; assert _ == xbytes('май')
# TODO also raise AttributeError on .encode/.decode lookup on classes
assert hasattr(us, 'encode') ; assert hasattr(ustr, 'encode')
assert not hasattr(bs, 'encode') #; assert not hasattr(bstr, 'encode')
assert not hasattr(us, 'decode') #; assert not hasattr(ustr, 'decode')
assert hasattr(bs, 'decode') ; assert hasattr(bstr, 'decode')
_ = us.encode(); assert type(_) is bstr; assert _bdata(_) == xbytes('мир')
_ = us.encode('utf-8'); assert type(_) is bstr; assert _bdata(_) == xbytes('мир')
_ = bs.decode(); assert type(_) is ustr; assert _udata(_) == u'май'
_ = bs.decode('utf-8'); assert type(_) is ustr; assert _udata(_) == u'май'
......@@ -673,10 +674,10 @@ def test_strings_encodedecode():
assert type(_) is ustr
assert _udata(_) == u'мир'
b_cpmir = us.encode('cp1251')
assert type(b_cpmir) is bstr
assert _bdata(b_cpmir) == u'мир'.encode('cp1251')
assert _bdata(b_cpmir) == b'\xec\xe8\xf0'
cpmir = us.encode('cp1251')
assert type(cpmir) is bytes
assert cpmir == u'мир'.encode('cp1251')
assert cpmir == b'\xec\xe8\xf0'
# decode/encode errors
u_k8mir = b_k8mir.decode() # no decode error with
......@@ -697,11 +698,14 @@ def test_strings_encodedecode():
us.encode('ascii')
_ = u_k8mir.encode() # no encode error with
assert type(_) is bstr # default parameters
assert _bdata(_) == k8mir
assert type(_) is bytes # default parameters
assert _ == k8mir
_ = u_k8mir.encode('utf-8', 'surrogateescape') # no encode error with
assert type(_) is bstr # explicit utf-8/surrogateescape
assert _bdata(_) == k8mir
assert type(_) is bytes # explicit utf-8/surrogateescape
assert _ == k8mir
_ = b_k8mir.encode() # bstr.encode = bstr -> ustr -> encode
assert type(_) is bytes
assert _ == k8mir
# on py2 unicode.encode accepts surrogate pairs and does not complain
# TODO(?) manually implement encode/py2 and reject surrogate pairs by default
......@@ -724,6 +728,14 @@ def test_strings_encodedecode():
_ = b(r'x\'y').decode('string-escape'); assert type(_) is bstr; assert _bdata(_) == b"x'y"
_ = b('616263').decode('hex'); assert type(_) is bstr; assert _bdata(_) == b"abc"
# similarly for bytes.encode
if six.PY3:
with raises(LookupError): bs.encode('hex')
with raises(LookupError): bs.encode('string-escape')
else:
_ = bs.encode('hex'); assert type(_) is bytes; assert _ == b'd0bcd0b0d0b9'
_ = bs.encode('string-escape'); assert type(_) is bytes; assert _ == br'\xd0\xbc\xd0\xb0\xd0\xb9'
# verify string operations like `x * 3` for all cases from bytes, bytearray, unicode, bstr and ustr.
@mark.parametrize('tx', (bytes, unicode, bytearray, bstr, ustr))
......@@ -1418,6 +1430,8 @@ def test_strings_mod_and_format():
M("α %s π", BB(xbytes('мир2')) , "α байты π") # not мир2
# vvv does not work on py3 as b'' % b'' does not consult __str__ nor __bytes__ of the argument
# even though it is not 100% we are ok here, because customizing bytes or unicode is very exotic
#
# XXX the code in bytesobject.c::format_obj tells different -> recheck.
if six.PY2:
M("α %s π", (BB(xbytes('мир2')),) , "α байты π") # not мир2
M("α %s π", [BB(xbytes('мир2'))] , "α [BB(байты)] π") # not [мир2]
......@@ -1884,7 +1898,7 @@ def test_strings_subclasses(tx):
# for bstr/ustr __bytes__/__unicode__ return *str, never MyStr
# (builtin unicode has no __bytes__/__unicode__)
if tx is not unicode:
_ = xx.__bytes__(); assert type(_) is bstr; assert _ == 'мир'
_ = xx.__bytes__(); assert type(_) is bytes; assert _ == xbytes('мир')
_ = xx.__unicode__(); assert type(_) is ustr; assert _ == 'мир'
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment