Commit 9ef32517 authored by Kirill Smelkov's avatar Kirill Smelkov

golang_str: Revert adding buffer interface to ustr

Testing this change on upcoming gpython/py3 with str patched to be ustr
revealed compatibility breakage in several places in the standard
library. One example of such breakage is os.listdir, which, after
doing PyObject_CheckBuffer, decides to return bytes instead of unicode in
the result:

    https://github.com/python/cpython/blob/v3.11.9-9-g1b0e63c81b5/Modules/posixmodule.c#L4194-L4195

which makes e.g. pytest fail to work:

    $ gpython -m pytest -vsx
    ...
      File ".../lib/python3.11/pathlib.py", line 370, in _select_from
        if self.match(name):
           ^^^^^^^^^^^^^^^^
    TypeError: cannot use a string pattern on a bytes-like object

This breakage was seen immediately, even without trying to run ERP5 on
top of gpy3. So in general adding buffer interface to ustr is believed to
break so much compatibility with standard unicode on py3 that we
decided against it.

-> Revert adding buffer interface to ustr.

This effectively reverts 8a240b5b (golang_str: Fix ustr to provide
buffer interface, like bstr already does), but keeps the added/updated
tests and comments explaining why making memoryview(ustr) work turned out
not to be a good idea.
parent 8a240b5b
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2024 Nexedi SA and Contributors.
# Copyright (C) 2018-2025 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
......@@ -30,17 +30,17 @@ from cpython cimport PyTypeObject, Py_TYPE, reprfunc, richcmpfunc, binaryfunc
from cpython cimport Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE
from cpython.iterobject cimport PySeqIter_New
from cpython cimport PyThreadState_GetDict, PyDict_SetItem
from cpython cimport PyObject_CheckBuffer, PyBuffer_FillInfo, Py_SIZE
from cpython cimport PyObject_CheckBuffer
cdef extern from "Python.h":
PyTypeObject PyBytes_Type
ctypedef struct PyBytesObject:
char *ob_sval
pass
cdef extern from "Python.h":
PyTypeObject PyUnicode_Type
ctypedef struct PyUnicodeObject:
PyObject *defenc # NOTE py2 only; ~ .utf8 on py3
pass
cdef extern from "Python.h":
"""
......@@ -1006,63 +1006,6 @@ cdef class _pyustr(unicode):
return t
# buffer interface so that ustr can be automatically converted to bytes for
# e.g. PyArg_Parse("s#") and memoryview.
def __getbuffer__(self, Py_buffer *buf, int flags):
# TODO py2: use .defenc directly if present (via _pyustr_getbuf)
# TODO py3: use .utf8 directly if present
bself = pyb(self)
bbself = <PyBytesObject*>bself
PyBuffer_FillInfo(buf, bself, bbself.ob_sval, Py_SIZE(bself), 1, flags)
# keep .bf_releasebuffer = NULL
# e.g. for t# py2 rejects conversion if it is !NULL with
# "argument ... must be string or pinned buffer"
# https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Python/getargs.c#L1356-L1391
#def __releasebuffer__(self, Py_buffer *buf):
# pass
# old-style buffer - used by py2
IF PY2:
def __getreadbuffer__(self, Py_ssize_t idx, void **pptr):
return _pyustr_getbuf(self, idx, pptr)
def __getcharbuffer__(self, Py_ssize_t idx, char **pptr):
return _pyustr_getbuf(self, idx, <void**>pptr)
def __getsegcount__(self, Py_ssize_t *lenp):
cdef void *_
if lenp != NULL:
lenp[0] = _pyustr_getbuf(self, 0, &_)
return 1
IF PY2:
# _pyustr_getbuf returns pointer to bytes data that correspond to ustr content.
#
# its definition is kept outside pyustr class because
# vtab is still created even with `@staticmethod cdef ...`
# https://github.com/cython/cython/issues/5337
# so we work it around via out-of-class definition
cdef Py_ssize_t _pyustr_getbuf(self, Py_ssize_t idx, void **pptr) except -1:
if idx != 0:
raise SystemError("accessing non-existent string segment")
uself = <PyUnicodeObject*>self
cdef PyObject* xbcopy = uself.defenc
if xbcopy == NULL:
bcopy = pyb(self)
Py_INCREF(bcopy)
xbcopy = <PyObject*>bcopy
uself.defenc = xbcopy
else:
bcopy = <object>xbcopy
assert isinstance(bcopy, bytes)
pptr[0] = (<PyBytesObject*>xbcopy).ob_sval
return Py_SIZE(bcopy)
# hand-made _pyustr.__new__ (workaround for https://github.com/cython/cython/issues/799)
cdef PyObject* _pyustr_tp_new(PyTypeObject* _cls, PyObject* _argv, PyObject* _kw) except NULL:
argv = ()
......
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2024 Nexedi SA and Contributors.
# Copyright (C) 2018-2025 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
......@@ -314,20 +314,40 @@ def test_strings_refcount():
# verify memoryview(bstr|ustr).
_ = (memoryview,)
if six.PY2:
# also verify buffer() on py2
def mbuffer(x):
return memoryview(buffer(x))
_ += (mbuffer,)
@mark.parametrize('tx', (bytes, bstr, ustr))
@mark.parametrize('mview', _)
def test_strings_memoryview(tx, mview):
# NOTE memoryview/buffer work for both bstr and ustr. In particular
# memoryview(ustr) does not raise TypeError and instead returns memoryview
# for bytes-representation of ustr.
def test_strings_memoryview(tx):
# NOTE memoryview works for both bytes and bstr but not for ustr.
#
# Even though it is technically possible(*) we cannot make memoryview(ustr)
# to work as it will result in breakage on gpython/py3 because many places
# in stdlib assume that if buffer interface is provided then the object is
# not a string. One example of such a place is os.listdir, which after
# doing PyObject_CheckBuffer decides to return bytes instead of unicode in
# the result:
#
# https://github.com/python/cpython/blob/v3.11.9-9-g1b0e63c81b5/Modules/posixmodule.c#L4194-L4195
#
# which makes e.g. pytest to fail to work with
#
# $ gpython -m pytest -vsx
# ...
# File ".../lib/python3.11/pathlib.py", line 370, in _select_from
# if self.match(name):
# ^^^^^^^^^^^^^^^^
# TypeError: cannot use a string pattern on a bytes-like object
#
# In general adding buffer interface to ustr is believed to break too much
# compatibility with standard unicode on py3 that we decided against it.
#
# (*) memoryview(ustr) could return memoryview for bytes-representation of ustr.
x = xstr(xbytes('мир')+b'\xff', tx) # note: invalid utf-8
m = mview(x)
if (tx is ustr):
with raises(TypeError):
memoryview(x)
return
m = memoryview(x)
assert m.format == 'B'
assert m.itemsize == 1
assert m.ndim == 1
......@@ -2080,7 +2100,7 @@ if six.PY3:
@mark.parametrize('fmt', _)
def test_strings_capi_getargs_to_cstr(tx, fmt):
if six.PY2:
if tx is ustr and fmt in ('s', 's_star', 's_hash', 'z', 'z_star', 'z_hash'):
if tx is ustr and fmt in ('s', 's_star', 's_hash', 'z', 'z_star', 'z_hash', 't_hash'):
# UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-3: ordinal not in range(128)
xfail("TODO: py2: PyArg_Parse(%s) vs ustr" % fmt)
......@@ -2095,8 +2115,9 @@ def test_strings_capi_getargs_to_cstr(tx, fmt):
# TODO we will try to handle this later
xfail("TODO: py3: PyArg_Parse(%s) vs bstr" % fmt)
if tx is ustr and fmt in ('s', 's_star', 's_hash', 'z', 'z_star', 'z_hash'):
if tx is ustr and fmt in ('s', 's_star', 's_hash', 'z', 'z_star', 'z_hash', 'y', 'y_star', 'y_hash'):
# UnicodeEncodeError: 'utf-8' codec can't encode character '\udcff' in position 3: surrogates not allowed
# TypeError: a bytes-like object is required, not 'golang.ustr'
xfail("TODO: py3: PyArg_Parse(%s) vs ustr" % fmt)
bmirf = xbytes('мир') + b'\xff' # invalid UTF-8 to make sure conversion
......@@ -2766,6 +2787,10 @@ def test_strings_base64(tx):
#
# even if default encoding is utf-8 (gpython) the result is 0LzQuNGA7bO
xfail("TODO: py2: ustr -> default encoded bstr")
if six.PY3 and tx is ustr:
# PyObject_GetBuffer(u)
# -> TypeError: a bytes-like object is required, not 'golang.ustr'
xfail("TODO: py3: accept ustr in binascii.b2a_base64")
x = xstr(u'мир', tx) + b'\xff' ; assert type(x) is tx
assert base64.b64encode(x) == b'0LzQuNGA/w=='
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment