diff --git a/from_cpython/Objects/unicodeobject.c b/from_cpython/Objects/unicodeobject.c
index 41a6f5c4608ceea4327bc3dd905cdc965c570526..3c7834964c8cf43feb96408dbd7789dc380cae6f 100644
--- a/from_cpython/Objects/unicodeobject.c
+++ b/from_cpython/Objects/unicodeobject.c
@@ -101,7 +101,7 @@ static PyUnicodeObject *free_list = NULL;
 static int numfree = 0;
 
 /* The empty Unicode object is shared to improve performance. */
-static PyUnicodeObject *unicode_empty = NULL;
+PyUnicodeObject *unicode_empty = NULL;
 
 #define _Py_RETURN_UNICODE_EMPTY() \
     do { \
@@ -317,76 +317,7 @@ int unicode_resize(register PyUnicodeObject *unicode,
 
 */
 
-static
-PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
-{
-    register PyUnicodeObject *unicode;
-
-    /* Optimization for empty strings */
-    if (length == 0 && unicode_empty != NULL) {
-        Py_INCREF(unicode_empty);
-        return unicode_empty;
-    }
-
-    /* Ensure we won't overflow the size. */
-    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
-        return (PyUnicodeObject *)PyErr_NoMemory();
-    }
-
-    /* Unicode freelist & memory allocation */
-    if (free_list) {
-        unicode = free_list;
-        free_list = *(PyUnicodeObject **)unicode;
-        numfree--;
-        if (unicode->str) {
-            /* Keep-Alive optimization: we only upsize the buffer,
-               never downsize it. */
-            if ((unicode->length < length) &&
-                unicode_resize(unicode, length) < 0) {
-                PyObject_DEL(unicode->str);
-                unicode->str = NULL;
-            }
-        }
-        else {
-            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
-            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
-        }
-        PyObject_INIT(unicode, &PyUnicode_Type);
-    }
-    else {
-        size_t new_size;
-        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
-        if (unicode == NULL)
-            return NULL;
-        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
-        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
-    }
-
-    if (!unicode->str) {
-        PyErr_NoMemory();
-        goto onError;
-    }
-    /* Initialize the first element to guard against cases where
-     * the caller fails before initializing str -- unicode_resize()
-     * reads str[0], and the Keep-Alive optimization can keep memory
-     * allocated for str alive across a call to unicode_dealloc(unicode).
-     * We don't want unicode_resize to read uninitialized memory in
-     * that case.
-     */
-    unicode->str[0] = 0;
-    unicode->str[length] = 0;
-    unicode->length = length;
-    unicode->hash = -1;
-    unicode->defenc = NULL;
-    return unicode;
-
- onError:
-    /* XXX UNREF/NEWREF interface should be more symmetrical */
-    _Py_DEC_REFTOTAL;
-    _Py_ForgetReference((PyObject *)unicode);
-    PyObject_Del(unicode);
-    return NULL;
-}
+extern PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
 
 static
 void unicode_dealloc(register PyUnicodeObject *unicode)
diff --git a/microbenchmarks/re_finditer_bench.py b/microbenchmarks/re_finditer_bench.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ad6c3654891cdf5dac70e06bff817eb3c6360cf
--- /dev/null
+++ b/microbenchmarks/re_finditer_bench.py
@@ -0,0 +1,7 @@
+import re
+def f():
+    r = re.compile(" ")
+    u = "a b c d"
+    for i in xrange(2000000):
+        r.finditer(u)
+f()
diff --git a/microbenchmarks/unicode_split_ubench.py b/microbenchmarks/unicode_split_ubench.py
new file mode 100644
index 0000000000000000000000000000000000000000..898e2b302a2072f9de826dfb09a61fef25376262
--- /dev/null
+++ b/microbenchmarks/unicode_split_ubench.py
@@ -0,0 +1,6 @@
+def f():
+    u = "a b c d"
+    u2 = u" "
+    for i in xrange(4000000):
+        u.split(u2)
+f()
diff --git a/src/runtime/types.cpp b/src/runtime/types.cpp
index 6a83f1b1a46f6e9e647b0a2b4f61945edfcca504..d0274d9dba16ade24c72be9b561c13843bc18d4c 100644
--- a/src/runtime/types.cpp
+++ b/src/runtime/types.cpp
@@ -2850,6 +2850,63 @@ out:
     return result;
 }
 
+extern "C" PyUnicodeObject* unicode_empty;
+extern "C" PyUnicodeObject* _PyUnicode_New(Py_ssize_t length) noexcept {
+    PyUnicodeObject* unicode;
+
+    /* Optimization for empty strings */
+    if (length == 0 && unicode_empty != NULL) {
+        Py_INCREF(unicode_empty);
+        return unicode_empty;
+    }
+
+    /* Ensure we won't overflow the size. */
+    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
+        return (PyUnicodeObject*)PyErr_NoMemory();
+    }
+
+    // Do a bunch of inlining + constant folding of this line of CPython's:
+    // unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
+    assert(PyUnicode_Type.tp_basicsize == sizeof(PyUnicodeObject)); // use the compile-time constant
+    unicode = (PyUnicodeObject*)gc_alloc(sizeof(PyUnicodeObject), gc::GCKind::CONSERVATIVE_PYTHON);
+    if (unicode == NULL)
+        return (PyUnicodeObject*)PyErr_NoMemory();
+
+    // Inline PyObject_INIT:
+    assert(!PyType_SUPPORTS_WEAKREFS(&PyUnicode_Type));
+    assert(!PyUnicode_Type.instancesHaveHCAttrs());
+    assert(!PyUnicode_Type.instancesHaveDictAttrs());
+    unicode->ob_type = (struct _typeobject*)&PyUnicode_Type;
+
+    size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
+    unicode->str = (Py_UNICODE*)PyMem_MALLOC(new_size); // why is this faster than gc_compat_malloc or gc_alloc??
+
+    if (!unicode->str) {
+        PyErr_NoMemory();
+        goto onError;
+    }
+    /* Initialize the first element to guard against cases where
+     * the caller fails before initializing str -- unicode_resize()
+     * reads str[0], and the Keep-Alive optimization can keep memory
+     * allocated for str alive across a call to unicode_dealloc(unicode).
+     * We don't want unicode_resize to read uninitialized memory in
+     * that case.
+     */
+    unicode->str[0] = 0;
+    unicode->str[length] = 0;
+    unicode->length = length;
+    unicode->hash = -1;
+    unicode->defenc = NULL;
+    return unicode;
+
+onError:
+    /* XXX UNREF/NEWREF interface should be more symmetrical */
+    _Py_DEC_REFTOTAL;
+    _Py_ForgetReference((PyObject*)unicode);
+    PyObject_Del(unicode);
+    return NULL;
+}
+
 bool TRACK_ALLOCATIONS = false;
 
 void setupRuntime() {