strconv: Move functionality related to UTF8 encode/decode into _golang_str

- Move _utf8_decode_rune, _utf8_decode_surrogateescape and _utf8_encode_surrogateescape out of strconv into _golang_str.
- Factor the _bstr/_ustr code into pyb/pyu; _bstr/_ustr become plain wrappers over pyb/pyu.
- Work around the resulting golang ↔ strconv cyclic dependency with an at-runtime import.

The moved routines belong to the main part of golang strings processing, so their home should be in _golang_str.pyx.

/reviewed-by @jerome
/reviewed-at !18

strconv: Move functionality related to UTF8 encode/decode into _golang_str
- Move _utf8_decode_rune, _utf8_decode_surrogateescape and _utf8_encode_surrogateescape out of strconv into _golang_str.
- Factor the _bstr/_ustr code into pyb/pyu; _bstr/_ustr become plain wrappers over pyb/pyu.
- Work around the resulting golang ↔ strconv cyclic dependency with an at-runtime import.

The moved routines belong to the main part of golang strings processing, so their home should be in _golang_str.pyx.

/reviewed-by @jerome
/reviewed-at !18
50b8cb7e · Kirill Smelkov · e72a459f · 50b8cb7e · 50b8cb7e · 50b8cb7e
Commit 50b8cb7e authored Oct 03, 2022 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 202 additions and 198 deletions

golang/__init__.py golang/__init__.py +8 -0

golang/_golang_str.pyx golang/_golang_str.pyx +187 -5

golang/strconv.py golang/strconv.py +7 -193

No files found.
--- a/golang/__init__.py
+++ b/golang/__init__.py
@@ -317,3 +317,11 @@ from ._golang import    \
    pyerror     as error,   \
    pyb         as b,       \
    pyu         as u
+# import golang.strconv into _golang from here to workaround cyclic golang ↔ strconv dependency
+#
+# strconv itself imports b and u from golang, so _golang cannot import strconv
+# at its own import time. Instead _golang.pystrconv starts as None and is
+# filled in here, once the golang package has been populated.
+def _():
+    from . import _golang
+    from . import strconv
+    _golang.pystrconv = strconv
+_()
+# the helper is needed only once - do not keep it in the package namespace
+del _
--- a/golang/_golang_str.pyx
+++ b/golang/_golang_str.pyx
@@ -22,7 +22,7 @@
 It is included from _golang.pyx .
 """
-from golang import strconv as pystrconv
+pystrconv = None  # = golang.strconv imported at runtime (see __init__.py)
 def pyb(s): # -> bytes
    """b converts str/unicode/bytes s to UTF-8 encoded bytestring.
@@ -40,8 +40,20 @@ def pyb(s): # -> bytes
       See also: u.
    """
-    bs, _ = pystrconv._bstr(s)
+    if isinstance(s, bytes):                    # py2: str      py3: bytes
-    return bs
+        pass
+    elif isinstance(s, unicode):                # py2: unicode  py3: str
+        if PY_MAJOR_VERSION >= 3:
+            s = s.encode('UTF-8', 'surrogateescape')
+        else:
+            # py2 does not have surrogateescape error handler, and even if we
+            # provide one, builtin unicode.encode() does not treat
+            # \udc80-\udcff as error. -> Do the encoding ourselves.
+            s = _utf8_encode_surrogateescape(s)
+    else:
+        raise TypeError("b: invalid type %s" % type(s))
+    return s
 def pyu(s): # -> unicode
    """u converts str/unicode/bytes s to unicode string.
@@ -61,8 +73,20 @@ def pyu(s): # -> unicode
       See also: b.
    """
-    us, _ = pystrconv._ustr(s)
+    if isinstance(s, unicode):                  # py2: unicode  py3: str
-    return us
+        pass
+    elif isinstance(s, bytes):                  # py2: str      py3: bytes
+        if PY_MAJOR_VERSION >= 3:
+            s = s.decode('UTF-8', 'surrogateescape')
+        else:
+            # py2 does not have surrogateescape error handler, and even if we
+            # provide one, builtin bytes.decode() does not treat surrogate
+            # sequences as error. -> Do the decoding ourselves.
+            s = _utf8_decode_surrogateescape(s)
+    else:
+        raise TypeError("u: invalid type %s" % type(s))
+    return s
 # __pystr converts obj to str of current python:
@@ -168,3 +192,161 @@ def pyqq(obj):
        qobj = _pystr(pyb(qobj))
    return qobj
+# ---- UTF-8 encode/decode ----
+from six import unichr                      # py2: unichr       py3: chr
+from six import int2byte as bchr            # py2: chr          py3: lambda x: bytes((x,))
+# _rune_error is returned by _utf8_decode_rune in place of a character that could not be decoded.
+_rune_error = 0xFFFD # unicode replacement character
+# _ucs2_build tells whether unicode strings of this python are limited to
+# 16-bit code units. The only other supported variant is a build whose
+# sys.maxunicode covers the whole unicode range.
+_ucs2_build        = (sys.maxunicode ==     0xffff)     #    ucs2
+assert _ucs2_build or sys.maxunicode >= 0x0010ffff      # or ucs4
+# _utf8_decode_rune decodes next UTF8-character from byte string s.
+#
+# _utf8_decode_rune(s) -> (r, size)
+#
+# r is the ordinal of the decoded character and size is the number of leading
+# bytes of s that encode it. Special cases:
+#
+#   (_rune_error, 0)  - s is empty;
+#   (_rune_error, 1)  - no prefix of s decodes to exactly one character.
+#
+# NOTE a validly encoded U+FFFD is also reported with r = _rune_error, but
+# with the size of the prefix that was actually decoded.
+def _utf8_decode_rune(s):
+    assert isinstance(s, bytes)
+    if len(s) == 0:
+        return _rune_error, 0
+    l = min(len(s), 4)  # max size of an UTF-8 encoded character
+    # try the longest candidate prefix first and shrink it until it decodes
+    # to exactly one character.
+    while l > 0:
+        try:
+            r = s[:l].decode('utf-8', 'strict')
+        except UnicodeDecodeError:
+            l -= 1
+            continue
+        if len(r) == 1:
+            return ord(r), l
+        # see comment in _utf8_encode_surrogateescape
+        if _ucs2_build and len(r) == 2:
+            try:
+                return _xuniord(r), l
+            # e.g. TypeError: ord() expected a character, but string of length 2 found
+            except TypeError:
+                l -= 1
+                continue
+        # the prefix decoded to more than one character - retry with a shorter one
+        l -= 1
+        continue
+    # invalid UTF-8
+    return _rune_error, 1
+# _utf8_decode_surrogateescape mimics s.decode('utf-8', 'surrogateescape') from py3.
+#
+# Valid UTF-8 sequences are decoded to their characters. Every byte b that is
+# not part of a valid sequence is decoded to the lone surrogate U+DC00+b
+# instead of raising, so that the original bytes can be restored later.
+#
+# NOTE pyu calls it only on py2, where indexing and iterating a bytestring
+# yields 1-character strings; the ord() calls below rely on that.
+def _utf8_decode_surrogateescape(s): # -> unicode
+    assert isinstance(s, bytes)
+    outv = []
+    emit = outv.append
+    while len(s) > 0:
+        r, width = _utf8_decode_rune(s)
+        # decode error is signalled by (_rune_error, 1). Checking r alone is
+        # not enough: a validly encoded U+FFFD also yields r = _rune_error,
+        # but with width = 3, and must go through as regular character.
+        if r == _rune_error and width == 1:
+            b = ord(s[0])
+            assert 0x80 <= b <= 0xff
+            emit(unichr(0xdc00 + b))
+        # python2 "correctly" decodes surrogates - don't allow that as
+        # surrogates are not valid UTF-8:
+        # https://github.com/python/cpython/blob/v3.8.1-118-gdbb37aac142/Objects/stringlib/codecs.h#L153-L157
+        # (python3 raises UnicodeDecodeError for surrogates)
+        #
+        # the surrogate range is [d800, dfff] with both ends included.
+        elif 0xd800 <= r <= 0xdfff:
+            for c in s[:width]:
+                b = ord(c)
+                # compare the ordinal, not the 1-character string itself
+                if b >= 0x80:
+                    emit(unichr(0xdc00 + b))
+                else:
+                    emit(unichr(b))
+        else:
+            emit(_xunichr(r))
+        s = s[width:]
+    return u''.join(outv)
+# _utf8_encode_surrogateescape mimics s.encode('utf-8', 'surrogateescape') from py3.
+#
+# Characters in [U+DC80, U+DCFF] are turned back into the single raw byte
+# they stand for. Everything else is encoded with the strict UTF-8 codec.
+def _utf8_encode_surrogateescape(s): # -> bytes
+    assert isinstance(s, unicode)
+    outv = []
+    emit = outv.append
+    while len(s) > 0:
+        # consume one unicode character from the head of s
+        uc = s[0]; s = s[1:]
+        c = ord(uc)
+        if 0xdc80 <= c <= 0xdcff:
+            # surrogate - emit unescaped byte
+            emit(bchr(c & 0xff))
+            continue
+        # in builds with --enable-unicode=ucs2 (default for py2 on macos and windows)
+        # python represents unicode points > 0xffff as _two_ unicode characters:
+        #
+        #   uh = u - 0x10000
+        #   c1 = 0xd800 + (uh >> 10)      ; [d800, dbff]
+        #   c2 = 0xdc00 + (uh & 0x3ff)    ; [dc00, dfff]
+        #
+        # if detected - merge those two unicode characters for .encode('utf-8') below
+        #
+        # this should be only relevant for python2, as python3 switched to "flexible"
+        # internal unicode representation: https://www.python.org/dev/peps/pep-0393
+        if _ucs2_build and (0xd800 <= c <= 0xdbff):
+            if len(s) > 0:
+                uc2 = s[0]
+                c2 = ord(uc2)
+                if 0xdc00 <= c2 <= 0xdfff:
+                    # high + low surrogate: encode them together and consume the second one too
+                    uc = uc + uc2
+                    s = s[1:]
+        emit(uc.encode('utf-8', 'strict'))
+    return b''.join(outv)
+# _xuniord returns ordinal for a unicode character u.
+#
+# it works correctly even if u is represented as 2 unicode surrogate points on
+# ucs2 python build.
+#
+# For any other input it behaves like ord, including raising TypeError when
+# u is not a single character.
+if not _ucs2_build:
+    _xuniord = ord
+else:
+    def _xuniord(u):
+        assert isinstance(u, unicode)
+        if len(u) == 1:
+            return ord(u)
+        # see _utf8_encode_surrogateescape for details
+        if len(u) == 2:
+            c1 = ord(u[0])
+            c2 = ord(u[1])
+            if (0xd800 <= c1 <= 0xdbff) and (0xdc00 <= c2 <= 0xdfff):
+                # the 0x10000 offset has to be added, not or'ed in:
+                # (c1 - 0xd800) << 10 goes up to 0xffc00 and so can have the
+                # 0x10000 bit set by itself, e.g. for U+20000 = (d840, dc00).
+                return 0x10000 + (((c1 - 0xd800) << 10) | (c2 - 0xdc00))
+        # let it crash
+        return ord(u)
+# _xunichr returns unicode character for an ordinal i.
+#
+# it works correctly even on ucs2 python builds, where ordinals >= 0x10000 are
+# represented as 2 unicode points.
+#
+# On such builds the result for i >= 0x10000 is a string of length 2 made of
+# a high surrogate followed by a low surrogate.
+if not _ucs2_build:
+    _xunichr = unichr
+else:
+    def _xunichr(i):
+        if i < 0x10000:
+            return unichr(i)
+        # see _utf8_encode_surrogateescape for details
+        uh = i - 0x10000
+        return unichr(0xd800 + (uh >> 10)) + \
+               unichr(0xdc00 + (uh & 0x3ff))
--- a/golang/strconv.py
+++ b/golang/strconv.py
 # -*- coding: utf-8 -*-
-# Copyright (C) 2018-2021  Nexedi SA and Contributors.
+# Copyright (C) 2018-2022  Nexedi SA and Contributors.
 #                          Kirill Smelkov <kirr@nexedi.com>
 #
 # This program is free software: you can Use, Study, Modify and Redistribute
@@ -21,55 +21,21 @@
 from __future__ import print_function, absolute_import
-import sys
+import unicodedata, codecs
-import six, unicodedata, codecs
 from six import text_type as unicode        # py2: unicode      py3: str
-from six import unichr                      # py2: unichr       py3: chr
-from six import int2byte as bchr            # py2: chr          py3: lambda x: bytes((x,))
 from six.moves import range as xrange
+from golang import b, u
+from golang._golang import _utf8_decode_rune, _rune_error, _xunichr
 # _bstr is like b but also returns whether input was unicode.
 def _bstr(s):   # -> sbytes, wasunicode
-    wasunicode = False
+    return b(s), isinstance(s, unicode)
-    if isinstance(s, bytes):                    # py2: str      py3: bytes
-        pass
-    elif isinstance(s, unicode):                # py2: unicode  py3: str
-        wasunicode = True
-    else:
-        raise TypeError("b: invalid type %s" % type(s))
-    if wasunicode:                              # py2: unicode  py3: str
-        if six.PY3:
-            s = s.encode('UTF-8', 'surrogateescape')
-        else:
-            # py2 does not have surrogateescape error handler, and even if we
-            # provide one, builtin unicode.encode() does not treat
-            # \udc80-\udcff as error. -> Do the encoding ourselves.
-            s = _utf8_encode_surrogateescape(s)
-    return s, wasunicode
 # _ustr is like u but also returns whether input was bytes.
 def _ustr(s):   # -> sunicode, wasbytes
-    wasbytes = True
+    return u(s), isinstance(s, bytes)
-    if isinstance(s, bytes):                    # py2: str      py3: bytes
-        pass
-    elif isinstance(s, unicode):                # py2: unicode  py3: str
-        wasbytes = False
-    else:
-        raise TypeError("u: invalid type %s" % type(s))
-    if wasbytes:
-        if six.PY3:
-            s = s.decode('UTF-8', 'surrogateescape')
-        else:
-            # py2 does not have surrogateescape error handler, and even if we
-            # provide one, builtin bytes.decode() does not treat surrogate
-            # sequences as error. -> Do the decoding ourselves.
-            s = _utf8_decode_surrogateescape(s)
-    return s, wasbytes
 # quote quotes unicode|bytes string into valid "..." unicode|bytes string always quoted with ".
@@ -226,155 +192,3 @@ def _unquote_next(s):
 _printable_cat0 = frozenset(['L', 'N', 'P', 'S'])   # letters, numbers, punctuation, symbols
-_rune_error = 0xFFFD # unicode replacement character
-_ucs2_build        = (sys.maxunicode ==     0xffff)     #    ucs2
-assert _ucs2_build or sys.maxunicode >= 0x0010ffff      # or ucs4
-# _utf8_decode_rune decodes next UTF8-character from byte string s.
-#
-# _utf8_decode_rune(s) -> (r, size)
-def _utf8_decode_rune(s):
-    assert isinstance(s, bytes)
-    if len(s) == 0:
-        return _rune_error, 0
-    l = min(len(s), 4)  # max size of an UTF-8 encoded character
-    while l > 0:
-        try:
-            r = s[:l].decode('utf-8', 'strict')
-        except UnicodeDecodeError:
-            l -= 1
-            continue
-        if len(r) == 1:
-            return ord(r), l
-        # see comment in _utf8_encode_surrogateescape
-        if _ucs2_build and len(r) == 2:
-            try:
-                return _xuniord(r), l
-            # e.g. TypeError: ord() expected a character, but string of length 2 found
-            except TypeError:
-                l -= 1
-                continue
-        l -= 1
-        continue
-    # invalid UTF-8
-    return _rune_error, 1
-# _utf8_decode_surrogateescape mimics s.decode('utf-8', 'surrogateescape') from py3.
-def _utf8_decode_surrogateescape(s): # -> unicode
-    assert isinstance(s, bytes)
-    outv = []
-    emit = outv.append
-    while len(s) > 0:
-        r, width = _utf8_decode_rune(s)
-        if r == _rune_error:
-            b = ord(s[0])
-            assert 0x80 <= b <= 0xff
-            emit(unichr(0xdc00 + b))
-        # python2 "correctly" decodes surrogates - don't allow that as
-        # surrogates are not valid UTF-8:
-        # https://github.com/python/cpython/blob/v3.8.1-118-gdbb37aac142/Objects/stringlib/codecs.h#L153-L157
-        # (python3 raises UnicodeDecodeError for surrogates)
-        elif 0xd800 <= r < 0xdfff:
-            for c in s[:width]:
-                b = ord(c)
-                if c >= 0x80:
-                    emit(unichr(0xdc00 + b))
-                else:
-                    emit(unichr(b))
-        else:
-            emit(_xunichr(r))
-        s = s[width:]
-    return u''.join(outv)
-# _utf8_encode_surrogateescape mimics s.encode('utf-8', 'surrogateescape') from py3.
-def _utf8_encode_surrogateescape(s): # -> bytes
-    assert isinstance(s, unicode)
-    outv = []
-    emit = outv.append
-    while len(s) > 0:
-        uc = s[0]; s = s[1:]
-        c = ord(uc)
-        if 0xdc80 <= c <= 0xdcff:
-            # surrogate - emit unescaped byte
-            emit(bchr(c & 0xff))
-            continue
-        # in builds with --enable-unicode=ucs2 (default for py2 on macos and windows)
-        # python represents unicode points > 0xffff as _two_ unicode characters:
-        #
-        #   uh = u - 0x10000
-        #   c1 = 0xd800 + (uh >> 10)      ; [d800, dbff]
-        #   c2 = 0xdc00 + (uh & 0x3ff)    ; [dc00, dfff]
-        #
-        # if detected - merge those two unicode characters for .encode('utf-8') below
-        #
-        # this should be only relevant for python2, as python3 switched to "flexible"
-        # internal unicode representation: https://www.python.org/dev/peps/pep-0393
-        if _ucs2_build and (0xd800 <= c <= 0xdbff):
-            if len(s) > 0:
-                uc2 = s[0]
-                c2 = ord(uc2)
-                if 0xdc00 <= c2 <= 0xdfff:
-                    uc = uc + uc2
-                    s = s[1:]
-        emit(uc.encode('utf-8', 'strict'))
-    return b''.join(outv)
-# _xuniord returns ordinal for a unicode character u.
-#
-# it works correctly even if u is represented as 2 unicode surrogate points on
-# ucs2 python build.
-if not _ucs2_build:
-    _xuniord = ord
-else:
-    def _xuniord(u):
-        assert isinstance(u, unicode)
-        if len(u) == 1:
-            return ord(u)
-        # see _utf8_encode_surrogateescape for details
-        if len(u) == 2:
-            c1 = ord(u[0])
-            c2 = ord(u[1])
-            if (0xd800 <= c1 <= 0xdbff) and (0xdc00 <= c2 <= 0xdfff):
-                return 0x10000 | ((c1 - 0xd800) << 10) | (c2 - 0xdc00)
-        # let it crash
-        return ord(u)
-# _xunichr returns unicode character for an ordinal i.
-#
-# it works correctly even on ucs2 python builds, where ordinals >= 0x10000 are
-# represented as 2 unicode pointe.
-if not _ucs2_build:
-    _xunichr = unichr
-else:
-    def _xunichr(i):
-        if i < 0x10000:
-            return unichr(i)
-        # see _utf8_encode_surrogateescape for details
-        uh = i - 0x10000
-        return unichr(0xd800 + (uh >> 10)) + \
-               unichr(0xdc00 + (uh & 0x3ff))