X golang_str: Add ustr.decode for symmetry with bstr.decode and because gpy2 breaks without it

Without working unicode.decode gpy2 fails when running ERP5 as follows: $ /srv/slapgrid/slappart49/t/ekg/i/5/bin/runTestSuite --help No handlers could be found for logger "SecurityInfo" Traceback (most recent call last): File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/bin/.runTestSuite.pyexe", line 296, in <module> main() File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 484, in main pymain(argv, init) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 292, in pymain run(mmain) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 192, in run _execfile(filepath, mmain.__dict__) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 339, in _execfile six.exec_(code, globals, locals) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/eggs/six-1.16.0-py2.7.egg/six.py", line 735, in exec_ exec("""exec _code_ in _globs_, _locs_""") File "<string>", line 1, in <module> File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/bin/runTestSuite", line 10, in <module> from Products.ERP5Type.tests.runTestSuite import main; sys.exit(main()) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/erp5/product/ERP5Type/__init__.py", line 96, in <module> from . 
import ZopePatch File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/erp5/product/ERP5Type/ZopePatch.py", line 75, in <module> from Products.ERP5Type.patches import ZopePageTemplateUtils File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/erp5/product/ERP5Type/patches/ZopePageTemplateUtils.py", line 58, in <module> convertToUnicode(u'', 'text/xml', ()) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/eggs/Zope-4.8.9+slapospatched002-py2.7.egg/Products/PageTemplates/utils.py", line 73, in convertToUnicode return source.decode(encoding), encoding AttributeError: unreadable attribute. And in general, if we treat both bstr and ustr as being two different representations of the same entity, if we have bstr.decode, having ustr.decode is also needed for symmetry, with both operations converting bytes representation of the string into unicode. Now there is full symmetry between bstr/ustr and encode/decode. Quoting updated encode/decode text: Encode encodes unicode representation of the string into bytes, leaving string domain. Decode decodes bytes representation of the string into ustr, staying inside string domain. Both bstr and ustr are accepted by encode and decode treating them as two different representations of the same entity. On encoding, for bstr, the string representation is first converted to unicode and encoded to bytes from there. For ustr unicode representation of the string is directly encoded. On decoding, for ustr, the string representation is first converted to bytes and decoded to unicode from there. For bstr bytes representation of the string is directly decoded.

X golang_str: Add ustr.decode for symmetry with bstr.decode and because gpy2 breaks without it
Without working unicode.decode gpy2 fails when running ERP5 as follows: $ /srv/slapgrid/slappart49/t/ekg/i/5/bin/runTestSuite --help No handlers could be found for logger "SecurityInfo" Traceback (most recent call last): File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/bin/.runTestSuite.pyexe", line 296, in <module> main() File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 484, in main pymain(argv, init) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 292, in pymain run(mmain) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 192, in run _execfile(filepath, mmain.__dict__) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 339, in _execfile six.exec_(code, globals, locals) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/eggs/six-1.16.0-py2.7.egg/six.py", line 735, in exec_ exec("""exec _code_ in _globs_, _locs_""") File "<string>", line 1, in <module> File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/bin/runTestSuite", line 10, in <module> from Products.ERP5Type.tests.runTestSuite import main; sys.exit(main()) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/erp5/product/ERP5Type/__init__.py", line 96, in <module> from . 
import ZopePatch File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/erp5/product/ERP5Type/ZopePatch.py", line 75, in <module> from Products.ERP5Type.patches import ZopePageTemplateUtils File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/erp5/product/ERP5Type/patches/ZopePageTemplateUtils.py", line 58, in <module> convertToUnicode(u'', 'text/xml', ()) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/eggs/Zope-4.8.9+slapospatched002-py2.7.egg/Products/PageTemplates/utils.py", line 73, in convertToUnicode return source.decode(encoding), encoding AttributeError: unreadable attribute. And in general, if we treat both bstr and ustr as being two different representations of the same entity, if we have bstr.decode, having ustr.decode is also needed for symmetry, with both operations converting bytes representation of the string into unicode. Now there is full symmetry between bstr/ustr and encode/decode. Quoting updated encode/decode text: Encode encodes unicode representation of the string into bytes, leaving string domain. Decode decodes bytes representation of the string into ustr, staying inside string domain. Both bstr and ustr are accepted by encode and decode treating them as two different representations of the same entity. On encoding, for bstr, the string representation is first converted to unicode and encoded to bytes from there. For ustr unicode representation of the string is directly encoded. On decoding, for ustr, the string representation is first converted to bytes and decoded to unicode from there. For bstr bytes representation of the string is directly decoded.
93e9c25a · Kirill Smelkov · abf3dcec · 93e9c25a · 93e9c25a
Commit 93e9c25a authored May 10, 2024 by Kirill Smelkov
Show whitespace changes
Inline Side-by-side

Showing with 100 additions and 67 deletions

golang/_golang_str.pyx golang/_golang_str.pyx +41 -20

golang/golang_str_test.py golang/golang_str_test.py +59 -47

No files found.
--- a/golang/_golang_str.pyx
+++ b/golang/_golang_str.pyx
@@ -528,25 +528,31 @@ cdef class _pybstr(bytes):   # https://github.com/cython/cython/issues/711

    # encode/decode
    #
-    # Encoding strings - both bstr and ustr - convert type to bytes leaving string domain.
+    # Encode encodes unicode representation of the string into bytes, leaving string domain.
+    # Decode decodes bytes   representation of the string into ustr, staying inside string domain.
    #
-    # Encode treats bstr and ustr as string, encoding unicode representation of
-    # the string to bytes. For bstr it means that the string representation is
-    # first converted to unicode and encoded to bytes from there. For ustr
-    # unicode representation of the string is directly encoded.
+    # Both bstr and ustr are accepted by encode and decode treating them as two
+    # different representations of the same entity.
    #
-    # Decoding strings is not provided. However for bstr the decode is provided
-    # treating input data as raw bytes and producing ustr as the result.
+    # On encoding, for bstr, the string representation is first converted to
+    # unicode and encoded to bytes from there. For ustr unicode representation
+    # of the string is directly encoded.
+    #
+    # On decoding, for ustr, the string representation is first converted to
+    # bytes and decoded to unicode from there. For bstr bytes representation of
+    # the string is directly decoded.
    #
    # NOTE __bytes__ and encode are the only operations that leave string domain.
    def encode(self, encoding=None, errors=None): # -> bytes
        encoding, errors = _encoding_with_defaults(encoding, errors)

+        if encoding == 'utf-8'  and  errors == 'surrogateescape':
+            return _bdata(self)
+
        # on py2 e.g. bytes.encode('string-escape') works on bytes directly
        if PY_MAJOR_VERSION < 3:
-            codec = pycodecs.lookup(encoding)
-            if not codec._is_text_encoding or \
-               encoding in ('string-escape',):  # string-escape also works on bytes
+            codec = _pycodecs_lookup_binary(encoding)
+            if codec is not None:
                return codec.encode(self, errors)[0]

        return pyu(self).encode(encoding, errors)
@@ -894,15 +900,23 @@ cdef class _pyustr(unicode):
        encoding, errors = _encoding_with_defaults(encoding, errors)

        if encoding == 'utf-8'  and  errors == 'surrogateescape':
-            x = _utf8_encode_surrogateescape(self)
-        else:
-            x = zunicode.encode(self, encoding, errors)
-        return x
+            return _utf8_encode_surrogateescape(self)

+        # on py2 e.g. 'string-escape' works on bytes
        if PY_MAJOR_VERSION < 3:
-        # whiteout decode inherited from unicode
-        # TODO ideally whiteout it in such a way that ustr.decode also raises AttributeError
-        decode = property(doc='ustr has no decode')
+            codec = _pycodecs_lookup_binary(encoding)
+            if codec is not None:
+                return codec.encode(pyb(self), errors)[0]
+
+        return zunicode.encode(self, encoding, errors)
+
+    def decode(self, encoding=None, errors=None): # -> ustr | bstr for  encodings like string-escape
+        encoding, errors = _encoding_with_defaults(encoding, errors)
+
+        if encoding == 'utf-8'  and  errors == 'surrogateescape':
+            return pyu(self)
+
+        return pyb(self).decode(encoding, errors)


    # all other string methods
@@ -2161,6 +2175,15 @@ cdef extern from "Python.h":
    """
    bint _XPyMapping_Check(object o)

+# _pycodecs_lookup_binary returns codec corresponding to encoding if the codec works on binary input.
+# example of such codecs are string-escape and hex encodings.
+cdef _pycodecs_lookup_binary(encoding): # -> codec | None (text) | LookupError (no such encoding)
+    codec = pycodecs.lookup(encoding)
+    if not codec._is_text_encoding or \
+       encoding in ('string-escape',):  # string-escape also works on bytes
+        return codec
+    return None
+

 # ---- UTF-8 encode/decode ----

@@ -2426,8 +2449,6 @@ cdef _patch_str():

    # XXX explain
    bpreserve_slots = upreserve_slots = ("maketrans",)
-    if PY_MAJOR_VERSION < 3:
-        upreserve_slots += ("decode",)

    # patch unicode to be pyustr. This patches
    # - unicode (py2)

--- a/golang/golang_str_test.py
+++ b/golang/golang_str_test.py
@@ -657,58 +657,61 @@ def test_strings_encodedecode():
    us = u('мир')
    bs = b('май')

-    _ = us.encode();         assert type(_) is bytes; assert _ == xbytes('мир')
-    _ = us.encode('utf-8');  assert type(_) is bytes; assert _ == xbytes('мир')
-    _ = bs.encode();         assert type(_) is bytes; assert _ == xbytes('май')
-    _ = bs.encode('utf-8');  assert type(_) is bytes; assert _ == xbytes('май')
+    # encode does obj.encode and makes sure result type is bytes
+    def encode(obj, *argv):
+        _ = obj.encode(*argv)
+        assert type(_) is bytes
+        return _

-    # TODO also raise AttributeError on .encode/.decode lookup on classes
-    assert not hasattr(us, 'decode')  #;   assert not hasattr(ustr, 'decode')
-    _ = bs.decode();         assert type(_) is ustr;  assert _udata(_) == u'май'
-    _ = bs.decode('utf-8');  assert type(_) is ustr;  assert _udata(_) == u'май'
+    # decode does obj.decode and makes sure result type is ustr
+    def decode(obj, *argv):
+        _ = obj.decode(*argv)
+        assert type(_) is ustr
+        return _

-    # !utf-8
-    k8mir = u'мир'.encode('koi8-r')
-    b_k8mir = b(k8mir)
-    assert type(b_k8mir) is bstr
-    assert _bdata(b_k8mir) == k8mir
-    assert _bdata(b_k8mir) == b'\xcd\xc9\xd2'
+    _ = encode(us);           assert _ == xbytes('мир')
+    _ = encode(us, 'utf-8');  assert _ == xbytes('мир')
+    _ = encode(bs);           assert _ == xbytes('май')
+    _ = encode(bs, 'utf-8');  assert _ == xbytes('май')

-    _ = b_k8mir.decode('koi8-r')
-    assert type(_) is ustr
-    assert _udata(_) == u'мир'
+    _ = decode(us);           assert _udata(_) == u'мир'
+    _ = decode(us, 'utf-8');  assert _udata(_) == u'мир'
+    _ = decode(bs);           assert _udata(_) == u'май'
+    _ = decode(bs, 'utf-8');  assert _udata(_) == u'май'

-    cpmir = us.encode('cp1251')
-    assert type(cpmir) is bytes
-    assert cpmir == u'мир'.encode('cp1251')
-    assert cpmir == b'\xec\xe8\xf0'
+    # !utf-8
+    k8mir = u'мир'.encode('koi8-r');  assert k8mir == b'\xcd\xc9\xd2'
+    b_k8mir = b(k8mir);  assert type(b_k8mir) is bstr;  assert _bdata(b_k8mir) == b'\xcd\xc9\xd2'
+    u_k8mir = u(k8mir);  assert type(u_k8mir) is ustr;  assert _udata(u_k8mir) == u'\udccd\udcc9\udcd2'

-    # decode/encode errors
-    u_k8mir = b_k8mir.decode()                          # no decode error with
-    assert type(u_k8mir) is ustr                        # default parameters
-    assert _udata(u_k8mir) == u'\udccd\udcc9\udcd2'
-    _ = b_k8mir.decode('utf-8', 'surrogateescape')      # no decode error with
-    assert type(_) is ustr                              # explicit utf-8/surrogateescape
-    assert _udata(_) == _udata(u_k8mir)
-
-    with raises(UnicodeDecodeError):  # decode error if encoding is explicitly specified
-        b_k8mir.decode('utf-8')
-    with raises(UnicodeDecodeError):
-        b_k8mir.decode('utf-8', 'strict')
-    with raises(UnicodeDecodeError):
-        b_k8mir.decode('ascii')
+    _ = decode(b_k8mir, 'koi8-r');  assert _udata(_) == u'мир'
+    _ = decode(u_k8mir, 'koi8-r');  assert _udata(_) == u'мир'

-    with raises(UnicodeEncodeError):
-        us.encode('ascii')
+    _ = encode(us, 'cp1251');  assert _ == u'мир'.encode('cp1251');  assert _ == b'\xec\xe8\xf0'
+    _ = encode(bs, 'cp1251');  assert _ == u'май'.encode('cp1251');  assert _ == b'\xec\xe0\xe9'

-    _ = u_k8mir.encode()                                # no encode error with
-    assert type(_) is bytes                             # default parameters
-    assert _ == k8mir
-    _ = u_k8mir.encode('utf-8', 'surrogateescape')      # no encode error with
-    assert type(_) is bytes                             # explicit utf-8/surrogateescape
+    # decode/encode errors
+    _ = decode(b_k8mir);  assert _ == u_k8mir           # no decode error with default parameters
+    _ = decode(b_k8mir, 'utf-8', 'surrogateescape')     # or with explicit utf-8/surrogateescape
+    assert _ == u_k8mir
+    _ = decode(u_k8mir);  assert _ == u_k8mir
+    _ = decode(u_k8mir, 'utf-8', 'surrogateescape');  assert _ == u_k8mir
+
+    with raises(UnicodeDecodeError):  b_k8mir.decode('utf-8')   # decode error on unmatching explicit encoding
+    with raises(UnicodeDecodeError):  u_k8mir.decode('utf-8')
+    with raises(UnicodeDecodeError):  b_k8mir.decode('utf-8', 'strict')
+    with raises(UnicodeDecodeError):  u_k8mir.decode('utf-8', 'strict')
+    with raises(UnicodeDecodeError):  b_k8mir.decode('ascii')
+    with raises(UnicodeDecodeError):  u_k8mir.decode('ascii')
+
+    with raises(UnicodeEncodeError):  us.encode('ascii')    # encode error if target encoding cannot represent string
+    with raises(UnicodeEncodeError):  bs.encode('ascii')
+
+    _ = encode(u_k8mir);  assert _ == k8mir             # no encode error with default parameters
+    _ = encode(u_k8mir, 'utf-8', 'surrogateescape')     # or with explicit utf-8/surrogateescape
    assert _ == k8mir
-    _ = b_k8mir.encode()                                # bstr.encode = bstr -> ustr -> encode
-    assert type(_) is bytes
+    _ = encode(b_k8mir);  assert _ == k8mir             # bstr.encode = bstr -> ustr -> encode
+    _ = encode(b_k8mir, 'utf-8', 'surrogateescape')
    assert _ == k8mir

    # on py2 unicode.encode accepts surrogate pairs and does not complain
@@ -726,19 +729,28 @@ def test_strings_encodedecode():
    # verify that this exact semantic is preserved
    if six.PY3:
        with raises(LookupError):  bs.decode('hex')
+        with raises(LookupError):  us.decode('hex')
        with raises(LookupError):  bs.decode('string-escape')
+        with raises(LookupError):  us.decode('string-escape')
    else:
        _ = bs.decode('string-escape');          assert type(_) is bstr;  assert _ == bs
+        _ = us.decode('string-escape');          assert type(_) is bstr;  assert _ == us
        _ = b(r'x\'y').decode('string-escape');  assert type(_) is bstr;  assert _bdata(_) == b"x'y"
+        _ = u(r'x\'y').decode('string-escape');  assert type(_) is bstr;  assert _bdata(_) == b"x'y"
        _ = b('616263').decode('hex');           assert type(_) is bstr;  assert _bdata(_) == b"abc"
+        _ = u('616263').decode('hex');           assert type(_) is bstr;  assert _bdata(_) == b"abc"

    # similarly for bytes.encode
    if six.PY3:
        with raises(LookupError):  bs.encode('hex')
+        with raises(LookupError):  us.encode('hex')
        with raises(LookupError):  bs.encode('string-escape')
+        with raises(LookupError):  us.encode('string-escape')
    else:
-        _ = bs.encode('hex');            assert type(_) is bytes;  assert _ == b'd0bcd0b0d0b9'
-        _ = bs.encode('string-escape');  assert type(_) is bytes;  assert _ == br'\xd0\xbc\xd0\xb0\xd0\xb9'
+        _ = encode(bs, 'hex');            assert _ == b'd0bcd0b0d0b9'
+        _ = encode(us, 'hex');            assert _ == b'd0bcd0b8d180'
+        _ = encode(bs, 'string-escape');  assert _ == br'\xd0\xbc\xd0\xb0\xd0\xb9'
+        _ = encode(us, 'string-escape');  assert _ == br'\xd0\xbc\xd0\xb8\xd1\x80'


 # verify string operations like `x * 3` for all cases from bytes, bytearray, unicode, bstr and ustr.