// Copyright (c) 2014-2015 Dropbox, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include <algorithm> #include <cstring> #include <sstream> #include <unordered_map> #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/raw_ostream.h" #include "Python.h" #include "capi/typeobject.h" #include "capi/types.h" #include "core/common.h" #include "core/types.h" #include "core/util.h" #include "gc/collector.h" #include "runtime/dict.h" #include "runtime/long.h" #include "runtime/objmodel.h" #include "runtime/types.h" #include "runtime/util.h" extern "C" PyObject* string_count(PyStringObject* self, PyObject* args) noexcept; extern "C" PyObject* string_join(PyStringObject* self, PyObject* orig) noexcept; extern "C" PyObject* string_split(PyStringObject* self, PyObject* args) noexcept; extern "C" PyObject* string_rsplit(PyStringObject* self, PyObject* args) noexcept; extern "C" PyObject* string_find(PyStringObject* self, PyObject* args) noexcept; extern "C" PyObject* string_index(PyStringObject* self, PyObject* args) noexcept; extern "C" PyObject* string_rindex(PyStringObject* self, PyObject* args) noexcept; extern "C" PyObject* string_rfind(PyStringObject* self, PyObject* args) noexcept; extern "C" PyObject* string_splitlines(PyStringObject* self, PyObject* args) noexcept; extern "C" PyObject* string__format__(PyObject* self, PyObject* args) noexcept; // from cpython's stringobject.c: #define LEFTSTRIP 0 #define RIGHTSTRIP 1 #define BOTHSTRIP 2 namespace pyston { BoxedString* EmptyString; BoxedString* characters[UCHAR_MAX + 1]; BoxedString::BoxedString(const char* s, size_t n) : interned_state(SSTATE_NOT_INTERNED) { assert(s); RELEASE_ASSERT(n != llvm::StringRef::npos, ""); memmove(data(), s, n); data()[n] = 0; } BoxedString::BoxedString(llvm::StringRef lhs, llvm::StringRef rhs) : interned_state(SSTATE_NOT_INTERNED) { RELEASE_ASSERT(lhs.size() + rhs.size() != llvm::StringRef::npos, ""); memmove(data(), lhs.data(), lhs.size()); memmove(data() + lhs.size(), rhs.data(), rhs.size()); data()[lhs.size() + rhs.size()] = 0; } BoxedString::BoxedString(llvm::StringRef s) : interned_state(SSTATE_NOT_INTERNED) { RELEASE_ASSERT(s.size() != llvm::StringRef::npos, ""); memmove(data(), s.data(), s.size()); data()[s.size()] = 0; } BoxedString::BoxedString(size_t n, char c) : interned_state(SSTATE_NOT_INTERNED) { RELEASE_ASSERT(n != llvm::StringRef::npos, ""); memset(data(), c, n); data()[n] = 0; } BoxedString::BoxedString(size_t n) : interned_state(SSTATE_NOT_INTERNED) { RELEASE_ASSERT(n != llvm::StringRef::npos, ""); // Note: no memset. add the null-terminator for good measure though // (CPython does the same thing). data()[n] = 0; } extern "C" char PyString_GetItem(PyObject* op, ssize_t n) noexcept { RELEASE_ASSERT(PyString_Check(op), ""); return static_cast<const BoxedString*>(op)->s()[n]; } extern "C" PyObject* PyString_FromFormatV(const char* format, va_list vargs) noexcept { va_list count; Py_ssize_t n = 0; const char* f; char* s; PyObject* string; #ifdef VA_LIST_IS_ARRAY Py_MEMCPY(count, vargs, sizeof(va_list)); #else #ifdef __va_copy __va_copy(count, vargs); #else count = vargs; #endif #endif /* step 1: figure out how large a buffer we need */ for (f = format; *f; f++) { if (*f == '%') { #ifdef HAVE_LONG_LONG int longlongflag = 0; #endif const char* p = f; while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f))) ; /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since * they don't affect the amount of space we reserve. */ if (*f == 'l') { if (f[1] == 'd' || f[1] == 'u') { ++f; } #ifdef HAVE_LONG_LONG else if (f[1] == 'l' && (f[2] == 'd' || f[2] == 'u')) { longlongflag = 1; f += 2; } #endif } else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { ++f; } switch (*f) { case 'c': (void)va_arg(count, int); /* fall through... */ case '%': n++; break; case 'd': case 'u': case 'i': case 'x': (void)va_arg(count, int); #ifdef HAVE_LONG_LONG /* Need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, plus 1 for the sign. 53/22 is an upper bound for log10(256). */ if (longlongflag) n += 2 + (SIZEOF_LONG_LONG * 53 - 1) / 22; else #endif /* 20 bytes is enough to hold a 64-bit integer. Decimal takes the most space. This isn't enough for octal. */ n += 20; break; case 's': s = va_arg(count, char*); n += strlen(s); break; case 'p': (void)va_arg(count, int); /* maximum 64-bit pointer representation: * 0xffffffffffffffff * so 19 characters is enough. * XXX I count 18 -- what's the extra for? */ n += 19; break; default: /* if we stumble upon an unknown formatting code, copy the rest of the format string to the output string. (we cannot just skip the code, since there's no way to know what's in the argument list) */ n += strlen(p); goto expand; } } else n++; } expand: /* step 2: fill the buffer */ /* Since we've analyzed how much space we need for the worst case, use sprintf directly instead of the slower PyOS_snprintf. */ string = PyString_FromStringAndSize(NULL, n); if (!string) return NULL; s = PyString_AsString(string); for (f = format; *f; f++) { if (*f == '%') { const char* p = f++; Py_ssize_t i; int longflag = 0; #ifdef HAVE_LONG_LONG int longlongflag = 0; #endif int size_tflag = 0; /* parse the width.precision part (we're only interested in the precision value, if any) */ n = 0; while (isdigit(Py_CHARMASK(*f))) n = (n * 10) + *f++ - '0'; if (*f == '.') { f++; n = 0; while (isdigit(Py_CHARMASK(*f))) n = (n * 10) + *f++ - '0'; } while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f))) f++; /* Handle %ld, %lu, %lld and %llu. */ if (*f == 'l') { if (f[1] == 'd' || f[1] == 'u') { longflag = 1; ++f; } #ifdef HAVE_LONG_LONG else if (f[1] == 'l' && (f[2] == 'd' || f[2] == 'u')) { longlongflag = 1; f += 2; } #endif } /* handle the size_t flag. */ else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { size_tflag = 1; ++f; } switch (*f) { case 'c': *s++ = va_arg(vargs, int); break; case 'd': if (longflag) sprintf(s, "%ld", va_arg(vargs, long)); #ifdef HAVE_LONG_LONG else if (longlongflag) sprintf(s, "%" PY_FORMAT_LONG_LONG "d", va_arg(vargs, PY_LONG_LONG)); #endif else if (size_tflag) sprintf(s, "%" PY_FORMAT_SIZE_T "d", va_arg(vargs, Py_ssize_t)); else sprintf(s, "%d", va_arg(vargs, int)); s += strlen(s); break; case 'u': if (longflag) sprintf(s, "%lu", va_arg(vargs, unsigned long)); #ifdef HAVE_LONG_LONG else if (longlongflag) sprintf(s, "%" PY_FORMAT_LONG_LONG "u", va_arg(vargs, PY_LONG_LONG)); #endif else if (size_tflag) sprintf(s, "%" PY_FORMAT_SIZE_T "u", va_arg(vargs, size_t)); else sprintf(s, "%u", va_arg(vargs, unsigned int)); s += strlen(s); break; case 'i': sprintf(s, "%i", va_arg(vargs, int)); s += strlen(s); break; case 'x': sprintf(s, "%x", va_arg(vargs, int)); s += strlen(s); break; case 's': p = va_arg(vargs, char*); i = strlen(p); if (n > 0 && i > n) i = n; Py_MEMCPY(s, p, i); s += i; break; case 'p': sprintf(s, "%p", va_arg(vargs, void*)); /* %p is ill-defined: ensure leading 0x. */ if (s[1] == 'X') s[1] = 'x'; else if (s[1] != 'x') { memmove(s + 2, s, strlen(s) + 1); s[0] = '0'; s[1] = 'x'; } s += strlen(s); break; case '%': *s++ = '%'; break; default: strcpy(s, p); s += strlen(s); goto end; } } else *s++ = *f; } end: if (_PyString_Resize(&string, s - PyString_AS_STRING(string))) return NULL; return string; } extern "C" PyObject* PyString_FromFormat(const char* format, ...) noexcept { PyObject* ret; va_list vargs; #ifdef HAVE_STDARG_PROTOTYPES va_start(vargs, format); #else va_start(vargs); #endif ret = PyString_FromFormatV(format, vargs); va_end(vargs); return ret; } extern "C" Box* strAdd(BoxedString* lhs, Box* _rhs) { assert(PyString_Check(lhs)); if (isSubclass(_rhs->cls, unicode_cls)) { Box* rtn = PyUnicode_Concat(lhs, _rhs); checkAndThrowCAPIException(); return rtn; } if (!PyString_Check(_rhs)) { if (PyByteArray_Check(_rhs)) { Box* rtn = PyByteArray_Concat(lhs, _rhs); checkAndThrowCAPIException(); return rtn; } else { // Note: this is deliberately not returning NotImplemented, even though // that would be more usual. I assume this behavior of CPython's is // for backwards compatibility. raiseExcHelper(TypeError, "cannot concatenate 'str' and '%s' objects", getTypeName(_rhs)); } } BoxedString* rhs = static_cast<BoxedString*>(_rhs); return new (lhs->size() + rhs->size()) BoxedString(lhs->s(), rhs->s()); } static llvm::StringMap<BoxedString*> interned_strings; static StatCounter num_interned_strings("num_interned_string"); extern "C" PyObject* PyString_InternFromString(const char* s) noexcept { RELEASE_ASSERT(s, ""); return internStringImmortal(s); } BoxedString* internStringImmortal(llvm::StringRef s) { auto& entry = interned_strings[s]; if (!entry) { num_interned_strings.log(); entry = (BoxedString*)PyGC_AddRoot(boxString(s)); // CPython returns mortal but in our current implementation they are inmortal entry->interned_state = SSTATE_INTERNED_IMMORTAL; } return entry; } extern "C" void PyString_InternInPlace(PyObject** p) noexcept { BoxedString* s = (BoxedString*)*p; if (s == NULL || !PyString_Check(s)) Py_FatalError("PyString_InternInPlace: strings only please!"); /* If it's a string subclass, we don't really know what putting it in the interned dict might do. */ if (!PyString_CheckExact(s)) return; if (PyString_CHECK_INTERNED(s)) return; auto& entry = interned_strings[s->s()]; if (entry) *p = entry; else { num_interned_strings.log(); entry = (BoxedString*)PyGC_AddRoot(s); // CPython returns mortal but in our current implementation they are inmortal s->interned_state = SSTATE_INTERNED_IMMORTAL; } } extern "C" int _PyString_CheckInterned(PyObject* p) noexcept { RELEASE_ASSERT(PyString_Check(p), ""); BoxedString* s = (BoxedString*)p; return s->interned_state; } /* Format codes * F_LJUST '-' * F_SIGN '+' * F_BLANK ' ' * F_ALT '#' * F_ZERO '0' */ #define F_LJUST (1 << 0) #define F_SIGN (1 << 1) #define F_BLANK (1 << 2) #define F_ALT (1 << 3) #define F_ZERO (1 << 4) Py_LOCAL_INLINE(PyObject*) getnextarg(PyObject* args, Py_ssize_t arglen, Py_ssize_t* p_argidx) { Py_ssize_t argidx = *p_argidx; if (argidx < arglen) { (*p_argidx)++; if (arglen < 0) return args; else return PyTuple_GetItem(args, argidx); } PyErr_SetString(PyExc_TypeError, "not enough arguments for format string"); return NULL; } extern "C" PyObject* _PyString_FormatLong(PyObject* val, int flags, int prec, int type, const char** pbuf, int* plen) noexcept { PyObject* result = NULL; char* buf; Py_ssize_t i; int sign; /* 1 if '-', else 0 */ int len; /* number of characters */ Py_ssize_t llen; int numdigits; /* len == numnondigits + numdigits */ int numnondigits = 0; switch (type) { case 'd': case 'u': result = Py_TYPE(val)->tp_str(val); break; case 'o': result = Py_TYPE(val)->tp_as_number->nb_oct(val); break; case 'x': case 'X': numnondigits = 2; result = Py_TYPE(val)->tp_as_number->nb_hex(val); break; default: assert(!"'type' not in [duoxX]"); } if (!result) return NULL; buf = PyString_AsString(result); if (!buf) { Py_DECREF(result); return NULL; } /* To modify the string in-place, there can only be one reference. */ // Pyston change: // if (Py_REFCNT(result) != 1) { if (0) { PyErr_BadInternalCall(); return NULL; } llen = PyString_Size(result); if (llen > INT_MAX) { PyErr_SetString(PyExc_ValueError, "string too large in _PyString_FormatLong"); return NULL; } len = (int)llen; if (buf[len - 1] == 'L') { --len; buf[len] = '\0'; } sign = buf[0] == '-'; numnondigits += sign; numdigits = len - numnondigits; assert(numdigits > 0); /* Get rid of base marker unless F_ALT */ if ((flags & F_ALT) == 0) { /* Need to skip 0x, 0X or 0. */ int skipped = 0; switch (type) { case 'o': assert(buf[sign] == '0'); /* If 0 is only digit, leave it alone. */ if (numdigits > 1) { skipped = 1; --numdigits; } break; case 'x': case 'X': assert(buf[sign] == '0'); assert(buf[sign + 1] == 'x'); skipped = 2; numnondigits -= 2; break; } if (skipped) { buf += skipped; len -= skipped; if (sign) buf[0] = '-'; } assert(len == numnondigits + numdigits); assert(numdigits > 0); } /* Fill with leading zeroes to meet minimum width. */ if (prec > numdigits) { PyObject* r1 = PyString_FromStringAndSize(NULL, numnondigits + prec); char* b1; if (!r1) { Py_DECREF(result); return NULL; } b1 = PyString_AS_STRING(r1); for (i = 0; i < numnondigits; ++i) *b1++ = *buf++; for (i = 0; i < prec - numdigits; i++) *b1++ = '0'; for (i = 0; i < numdigits; i++) *b1++ = *buf++; *b1 = '\0'; Py_DECREF(result); result = r1; buf = PyString_AS_STRING(result); len = numnondigits + prec; } /* Fix up case for hex conversions. */ if (type == 'X') { /* Need to convert all lower case letters to upper case. and need to convert 0x to 0X (and -0x to -0X). */ for (i = 0; i < len; i++) if (buf[i] >= 'a' && buf[i] <= 'x') buf[i] -= 'a' - 'A'; } *pbuf = buf; *plen = len; return result; } static PyObject* formatfloat(PyObject* v, int flags, int prec, int type) { char* p; PyObject* result; double x; x = PyFloat_AsDouble(v); if (x == -1.0 && PyErr_Occurred()) { PyErr_Format(PyExc_TypeError, "float argument required, " "not %.200s", Py_TYPE(v)->tp_name); return NULL; } if (prec < 0) prec = 6; p = PyOS_double_to_string(x, type, prec, (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); if (p == NULL) return NULL; result = PyString_FromStringAndSize(p, strlen(p)); PyMem_Free(p); return result; } Py_LOCAL_INLINE(int) formatint(char* buf, size_t buflen, int flags, int prec, int type, PyObject* v) { /* fmt = '%#.' + `prec` + 'l' + `type` worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) + 1 + 1 = 24 */ char fmt[64]; /* plenty big enough! */ const char* sign; long x; x = PyInt_AsLong(v); if (x == -1 && PyErr_Occurred()) { PyErr_Format(PyExc_TypeError, "int argument required, not %.200s", Py_TYPE(v)->tp_name); return -1; } if (x < 0 && type == 'u') { type = 'd'; } if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) sign = "-"; else sign = ""; if (prec < 0) prec = 1; if ((flags & F_ALT) && (type == 'x' || type == 'X')) { /* When converting under %#x or %#X, there are a number * of issues that cause pain: * - when 0 is being converted, the C standard leaves off * the '0x' or '0X', which is inconsistent with other * %#x/%#X conversions and inconsistent with Python's * hex() function * - there are platforms that violate the standard and * convert 0 with the '0x' or '0X' * (Metrowerks, Compaq Tru64) * - there are platforms that give '0x' when converting * under %#X, but convert 0 in accordance with the * standard (OS/2 EMX) * * We can achieve the desired consistency by inserting our * own '0x' or '0X' prefix, and substituting %x/%X in place * of %#x/%#X. * * Note that this is the same approach as used in * formatint() in unicodeobject.c */ PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", sign, type, prec, type); } else { PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", sign, (flags & F_ALT) ? "#" : "", prec, type); } /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 */ if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { PyErr_SetString(PyExc_OverflowError, "formatted integer is too long (precision too large?)"); return -1; } if (sign[0]) PyOS_snprintf(buf, buflen, fmt, -x); else PyOS_snprintf(buf, buflen, fmt, x); return (int)strlen(buf); } Py_LOCAL_INLINE(int) formatchar(char* buf, size_t buflen, PyObject* v) { /* presume that the buffer is at least 2 characters long */ if (PyString_Check(v)) { if (!PyArg_Parse(v, "c;%c requires int or char", &buf[0])) return -1; } else { if (!PyArg_Parse(v, "b;%c requires int or char", &buf[0])) return -1; } buf[1] = '\0'; return 1; } #define FORMATBUFLEN (size_t)120 extern "C" PyObject* PyString_Format(PyObject* format, PyObject* args) noexcept { char* fmt, *res; Py_ssize_t arglen, argidx; Py_ssize_t reslen, rescnt, fmtcnt; int args_owned = 0; PyObject* result, *orig_args; #ifdef Py_USING_UNICODE PyObject* v, *w; #endif PyObject* dict = NULL; if (format == NULL || !PyString_Check(format) || args == NULL) { PyErr_BadInternalCall(); return NULL; } orig_args = args; fmt = PyString_AS_STRING(format); fmtcnt = PyString_GET_SIZE(format); reslen = rescnt = fmtcnt + 100; result = PyString_FromStringAndSize((char*)NULL, reslen); if (result == NULL) return NULL; res = PyString_AsString(result); if (PyTuple_Check(args)) { arglen = PyTuple_GET_SIZE(args); argidx = 0; } else { arglen = -1; argidx = -2; } if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript && !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type)) dict = args; while (--fmtcnt >= 0) { if (*fmt != '%') { if (--rescnt < 0) { rescnt = fmtcnt + 100; reslen += rescnt; if (_PyString_Resize(&result, reslen)) return NULL; res = PyString_AS_STRING(result) + reslen - rescnt; --rescnt; } *res++ = *fmt++; } else { /* Got a format specifier */ int flags = 0; Py_ssize_t width = -1; int prec = -1; int c = '\0'; int fill; int isnumok; PyObject* v = NULL; PyObject* temp = NULL; const char* pbuf; int sign; Py_ssize_t len; char formatbuf[FORMATBUFLEN]; /* For format{int,char}() */ #ifdef Py_USING_UNICODE char* fmt_start = fmt; Py_ssize_t argidx_start = argidx; #endif fmt++; if (*fmt == '(') { char* keystart; Py_ssize_t keylen; PyObject* key; int pcount = 1; if (dict == NULL) { PyErr_SetString(PyExc_TypeError, "format requires a mapping"); goto error; } ++fmt; --fmtcnt; keystart = fmt; /* Skip over balanced parentheses */ while (pcount > 0 && --fmtcnt >= 0) { if (*fmt == ')') --pcount; else if (*fmt == '(') ++pcount; fmt++; } keylen = fmt - keystart - 1; if (fmtcnt < 0 || pcount > 0) { PyErr_SetString(PyExc_ValueError, "incomplete format key"); goto error; } key = PyString_FromStringAndSize(keystart, keylen); if (key == NULL) goto error; if (args_owned) { Py_DECREF(args); args_owned = 0; } args = PyObject_GetItem(dict, key); Py_DECREF(key); if (args == NULL) { goto error; } args_owned = 1; arglen = -1; argidx = -2; } while (--fmtcnt >= 0) { switch (c = *fmt++) { case '-': flags |= F_LJUST; continue; case '+': flags |= F_SIGN; continue; case ' ': flags |= F_BLANK; continue; case '#': flags |= F_ALT; continue; case '0': flags |= F_ZERO; continue; } break; } if (c == '*') { v = getnextarg(args, arglen, &argidx); if (v == NULL) goto error; if (!PyInt_Check(v)) { PyErr_SetString(PyExc_TypeError, "* wants int"); goto error; } width = PyInt_AsSsize_t(v); if (width == -1 && PyErr_Occurred()) goto error; if (width < 0) { flags |= F_LJUST; width = -width; } if (--fmtcnt >= 0) c = *fmt++; } else if (c >= 0 && isdigit(c)) { width = c - '0'; while (--fmtcnt >= 0) { c = Py_CHARMASK(*fmt++); if (!isdigit(c)) break; if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) { PyErr_SetString(PyExc_ValueError, "width too big"); goto error; } width = width * 10 + (c - '0'); } } if (c == '.') { prec = 0; if (--fmtcnt >= 0) c = *fmt++; if (c == '*') { v = getnextarg(args, arglen, &argidx); if (v == NULL) goto error; if (!PyInt_Check(v)) { PyErr_SetString(PyExc_TypeError, "* wants int"); goto error; } prec = _PyInt_AsInt(v); if (prec == -1 && PyErr_Occurred()) goto error; if (prec < 0) prec = 0; if (--fmtcnt >= 0) c = *fmt++; } else if (c >= 0 && isdigit(c)) { prec = c - '0'; while (--fmtcnt >= 0) { c = Py_CHARMASK(*fmt++); if (!isdigit(c)) break; if (prec > (INT_MAX - ((int)c - '0')) / 10) { PyErr_SetString(PyExc_ValueError, "prec too big"); goto error; } prec = prec * 10 + (c - '0'); } } } /* prec */ if (fmtcnt >= 0) { if (c == 'h' || c == 'l' || c == 'L') { if (--fmtcnt >= 0) c = *fmt++; } } if (fmtcnt < 0) { PyErr_SetString(PyExc_ValueError, "incomplete format"); goto error; } if (c != '%') { v = getnextarg(args, arglen, &argidx); if (v == NULL) goto error; } sign = 0; fill = ' '; switch (c) { case '%': pbuf = "%"; len = 1; break; case 's': #ifdef Py_USING_UNICODE if (PyUnicode_Check(v)) { fmt = fmt_start; argidx = argidx_start; goto unicode; } #endif temp = _PyObject_Str(v); #ifdef Py_USING_UNICODE if (temp != NULL && PyUnicode_Check(temp)) { Py_DECREF(temp); fmt = fmt_start; argidx = argidx_start; goto unicode; } #endif /* Fall through */ case 'r': if (c == 'r') temp = PyObject_Repr(v); if (temp == NULL) goto error; if (!PyString_Check(temp)) { PyErr_SetString(PyExc_TypeError, "%s argument has non-string str()"); Py_DECREF(temp); goto error; } pbuf = PyString_AS_STRING(temp); len = PyString_GET_SIZE(temp); if (prec >= 0 && len > prec) len = prec; break; case 'i': case 'd': case 'u': case 'o': case 'x': case 'X': if (c == 'i') c = 'd'; isnumok = 0; if (PyNumber_Check(v)) { PyObject* iobj = NULL; if (PyInt_Check(v) || (PyLong_Check(v))) { iobj = v; Py_INCREF(iobj); } else { iobj = PyNumber_Int(v); if (iobj == NULL) { PyErr_Clear(); iobj = PyNumber_Long(v); } } if (iobj != NULL) { if (PyInt_Check(iobj)) { isnumok = 1; pbuf = formatbuf; // Pyston change: len = formatint(formatbuf /* pbuf */, sizeof(formatbuf), flags, prec, c, iobj); Py_DECREF(iobj); if (len < 0) goto error; sign = 1; } else if (PyLong_Check(iobj)) { int ilen; isnumok = 1; temp = _PyString_FormatLong(iobj, flags, prec, c, &pbuf, &ilen); Py_DECREF(iobj); len = ilen; if (!temp) goto error; sign = 1; } else { Py_DECREF(iobj); } } } if (!isnumok) { PyErr_Format(PyExc_TypeError, "%%%c format: a number is required, " "not %.200s", c, Py_TYPE(v)->tp_name); goto error; } if (flags & F_ZERO) fill = '0'; break; case 'e': case 'E': case 'f': case 'F': case 'g': case 'G': temp = formatfloat(v, flags, prec, c); if (temp == NULL) goto error; pbuf = PyString_AS_STRING(temp); len = PyString_GET_SIZE(temp); sign = 1; if (flags & F_ZERO) fill = '0'; break; case 'c': #ifdef Py_USING_UNICODE if (PyUnicode_Check(v)) { fmt = fmt_start; argidx = argidx_start; goto unicode; } #endif pbuf = formatbuf; // Pyston change: len = formatchar(formatbuf /* was pbuf */, sizeof(formatbuf), v); if (len < 0) goto error; break; default: PyErr_Format(PyExc_ValueError, "unsupported format character '%c' (0x%x) " "at index %zd", c, c, (Py_ssize_t)(fmt - 1 - PyString_AsString(format))); goto error; } if (sign) { if (*pbuf == '-' || *pbuf == '+') { sign = *pbuf++; len--; } else if (flags & F_SIGN) sign = '+'; else if (flags & F_BLANK) sign = ' '; else sign = 0; } if (width < len) width = len; if (rescnt - (sign != 0) < width) { reslen -= rescnt; rescnt = width + fmtcnt + 100; reslen += rescnt; if (reslen < 0) { Py_DECREF(result); Py_XDECREF(temp); return PyErr_NoMemory(); } if (_PyString_Resize(&result, reslen)) { Py_XDECREF(temp); return NULL; } res = PyString_AS_STRING(result) + reslen - rescnt; } if (sign) { if (fill != ' ') *res++ = sign; rescnt--; if (width > len) width--; } if ((flags & F_ALT) && (c == 'x' || c == 'X')) { assert(pbuf[0] == '0'); assert(pbuf[1] == c); if (fill != ' ') { *res++ = *pbuf++; *res++ = *pbuf++; } rescnt -= 2; width -= 2; if (width < 0) width = 0; len -= 2; } if (width > len && !(flags & F_LJUST)) { do { --rescnt; *res++ = fill; } while (--width > len); } if (fill == ' ') { if (sign) *res++ = sign; if ((flags & F_ALT) && (c == 'x' || c == 'X')) { assert(pbuf[0] == '0'); assert(pbuf[1] == c); *res++ = *pbuf++; *res++ = *pbuf++; } } Py_MEMCPY(res, pbuf, len); res += len; rescnt -= len; while (--width >= len) { --rescnt; *res++ = ' '; } if (dict && (argidx < arglen) && c != '%') { PyErr_SetString(PyExc_TypeError, "not all arguments converted during string formatting"); Py_XDECREF(temp); goto error; } Py_XDECREF(temp); } /* '%' */ } /* until end */ if (argidx < arglen && !dict) { PyErr_SetString(PyExc_TypeError, "not all arguments converted during string formatting"); goto error; } if (args_owned) { Py_DECREF(args); } if (_PyString_Resize(&result, reslen - rescnt)) return NULL; return result; #ifdef Py_USING_UNICODE unicode: if (args_owned) { Py_DECREF(args); args_owned = 0; } /* Fiddle args right (remove the first argidx arguments) */ if (PyTuple_Check(orig_args) && argidx > 0) { PyObject* v; Py_ssize_t n = PyTuple_GET_SIZE(orig_args) - argidx; v = PyTuple_New(n); if (v == NULL) goto error; while (--n >= 0) { PyObject* w = PyTuple_GET_ITEM(orig_args, n + argidx); Py_INCREF(w); PyTuple_SET_ITEM(v, n, w); } args = v; } else { Py_INCREF(orig_args); args = orig_args; } args_owned = 1; /* Take what we have of the result and let the Unicode formatting function format the rest of the input. */ rescnt = res - PyString_AS_STRING(result); if (_PyString_Resize(&result, rescnt)) goto error; fmtcnt = PyString_GET_SIZE(format) - (fmt - PyString_AS_STRING(format)); format = PyUnicode_Decode(fmt, fmtcnt, NULL, NULL); if (format == NULL) goto error; v = PyUnicode_Format(format, args); Py_DECREF(format); if (v == NULL) goto error; /* Paste what we have (result) to what the Unicode formatting function returned (v) and return the result (or error) */ w = PyUnicode_Concat(result, v); Py_DECREF(result); Py_DECREF(v); Py_DECREF(args); return w; #endif /* Py_USING_UNICODE */ error: Py_DECREF(result); if (args_owned) { Py_DECREF(args); } return NULL; } extern "C" Box* strMod(BoxedString* lhs, Box* rhs) { Box* rtn = PyString_Format(lhs, rhs); checkAndThrowCAPIException(); assert(rtn); return rtn; } extern "C" Box* strMul(BoxedString* lhs, Box* rhs) { assert(PyString_Check(lhs)); int n; if (PyInt_Check(rhs)) n = static_cast<BoxedInt*>(rhs)->n; else return NotImplemented; if (n <= 0) return EmptyString; // TODO: use createUninitializedString and getWriteableStringContents int sz = lhs->size(); std::string buf(sz * n, '\0'); for (int i = 0; i < n; i++) { memcpy(&buf[sz * i], lhs->data(), sz); } return boxString(buf); } Box* str_richcompare(Box* lhs, Box* rhs, int op) { assert(PyString_Check(lhs)); // Note: it is somehow about 50% faster to do this check inside the switch // statement, rather than out here. It's functionally equivalent but the // generated assembly is somehow quite better: // if (unlikely(!PyString_Check(rhs))) // return NotImplemented; BoxedString* slhs = static_cast<BoxedString*>(lhs); BoxedString* srhs = static_cast<BoxedString*>(rhs); switch (op) { case Py_EQ: if (unlikely(!PyString_Check(rhs))) return NotImplemented; return boxBool(slhs->s() == srhs->s()); case Py_NE: if (unlikely(!PyString_Check(rhs))) return NotImplemented; return boxBool(slhs->s() != srhs->s()); case Py_LT: if (unlikely(!PyString_Check(rhs))) return NotImplemented; return boxBool(slhs->s() < srhs->s()); case Py_LE: if (unlikely(!PyString_Check(rhs))) return NotImplemented; return boxBool(slhs->s() <= srhs->s()); case Py_GT: if (unlikely(!PyString_Check(rhs))) return NotImplemented; return boxBool(slhs->s() > srhs->s()); case Py_GE: if (unlikely(!PyString_Check(rhs))) return NotImplemented; return boxBool(slhs->s() >= srhs->s()); default: llvm_unreachable("invalid op"); } } #define JUST_LEFT 0 #define JUST_RIGHT 1 #define JUST_CENTER 2 static Box* pad(BoxedString* self, Box* width, Box* fillchar, int justType) { assert(width->cls == int_cls); assert(PyString_Check(fillchar)); assert(static_cast<BoxedString*>(fillchar)->size() == 1); int64_t curWidth = self->size(); int64_t targetWidth = static_cast<BoxedInt*>(width)->n; if (curWidth >= targetWidth) { if (self->cls == str_cls) { return self; } else { // If self isn't a string but a subclass of str, then make a new string to return return boxString(self->s()); } } char c = static_cast<BoxedString*>(fillchar)->s()[0]; int padLeft, padRight; int nNeeded = targetWidth - curWidth; switch (justType) { case JUST_LEFT: padLeft = 0; padRight = nNeeded; break; case JUST_RIGHT: padLeft = nNeeded; padRight = 0; break; case JUST_CENTER: padLeft = nNeeded / 2 + (nNeeded & targetWidth & 1); padRight = nNeeded - padLeft; break; default: abort(); } // TODO this is probably slow return boxStringTwine(llvm::Twine(std::string(padLeft, c)) + self->s() + std::string(padRight, c)); } extern "C" Box* strLjust(BoxedString* lhs, Box* width, Box* fillchar) { return pad(lhs, width, fillchar, JUST_LEFT); } extern "C" Box* strRjust(BoxedString* lhs, Box* width, Box* fillchar) { return pad(lhs, width, fillchar, JUST_RIGHT); } extern "C" Box* strCenter(BoxedString* lhs, Box* width, Box* fillchar) { return pad(lhs, width, fillchar, JUST_CENTER); } extern "C" Box* strLen(BoxedString* self) { assert(PyString_Check(self)); return boxInt(self->size()); } extern "C" Box* strStr(BoxedString* self) { assert(PyString_Check(self)); if (self->cls == str_cls) return self; return boxString(self->s()); } static bool _needs_escaping[256] = { true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true }; static char _hex[17] = "0123456789abcdef"; // really only needs to be 16 but clang will complain extern "C" PyObject* PyString_Repr(PyObject* obj, int smartquotes) noexcept { BoxedString* op = (BoxedString*)obj; size_t newsize = 2 + 4 * Py_SIZE(op); PyObject* v; if (newsize > PY_SSIZE_T_MAX || newsize / 4 != Py_SIZE(op)) { PyErr_SetString(PyExc_OverflowError, "string is too large to make repr"); return NULL; } v = PyString_FromStringAndSize((char*)NULL, newsize); if (v == NULL) { return NULL; } else { Py_ssize_t i; char c; char* p; int quote; /* figure out which quote to use; single is preferred */ quote = '\''; if (smartquotes && memchr(op->data(), '\'', Py_SIZE(op)) && !memchr(op->data(), '"', Py_SIZE(op))) quote = '"'; p = PyString_AS_STRING(v); *p++ = quote; for (i = 0; i < Py_SIZE(op); i++) { /* There's at least enough room for a hex escape * and a closing quote. */ assert(newsize - (p - PyString_AS_STRING(v)) >= 5); c = op->data()[i]; if (c == quote || c == '\\') *p++ = '\\', *p++ = c; else if (c == '\t') *p++ = '\\', *p++ = 't'; else if (c == '\n') *p++ = '\\', *p++ = 'n'; else if (c == '\r') *p++ = '\\', *p++ = 'r'; else if (c < ' ' || c >= 0x7f) { /* For performance, we don't want to call * PyOS_snprintf here (extra layers of * function call). */ sprintf(p, "\\x%02x", c & 0xff); p += 4; } else *p++ = c; } assert(newsize - (p - PyString_AS_STRING(v)) >= 1); *p++ = quote; *p = '\0'; if (_PyString_Resize(&v, (p - PyString_AS_STRING(v)))) return NULL; return v; } } extern "C" Box* strRepr(BoxedString* self) { return PyString_Repr(self, 1 /* smartquotes */); } extern "C" Box* str_repr(Box* self) noexcept { return PyString_Repr(self, 1 /* smartquotes */); } /* Unescape a backslash-escaped string. If unicode is non-zero, the string is a u-literal. If recode_encoding is non-zero, the string is UTF-8 encoded and should be re-encoded in the specified encoding. */ extern "C" PyObject* PyString_DecodeEscape(const char* s, Py_ssize_t len, const char* errors, Py_ssize_t unicode, const char* recode_encoding) noexcept { int c; char* p, *buf; const char* end; PyObject* v; Py_ssize_t newlen = recode_encoding ? 4 * len : len; v = PyString_FromStringAndSize((char*)NULL, newlen); if (v == NULL) return NULL; p = buf = PyString_AsString(v); end = s + len; while (s < end) { if (*s != '\\') { non_esc: #ifdef Py_USING_UNICODE if (recode_encoding && (*s & 0x80)) { PyObject* u, *w; char* r; const char* t; Py_ssize_t rn; t = s; /* Decode non-ASCII bytes as UTF-8. */ while (t < end && (*t & 0x80)) t++; u = PyUnicode_DecodeUTF8(s, t - s, errors); if (!u) goto failed; /* Recode them in target encoding. */ w = PyUnicode_AsEncodedString(u, recode_encoding, errors); Py_DECREF(u); if (!w) goto failed; /* Append bytes to output buffer. */ assert(PyString_Check(w)); r = PyString_AS_STRING(w); rn = PyString_GET_SIZE(w); Py_MEMCPY(p, r, rn); p += rn; Py_DECREF(w); s = t; } else { *p++ = *s++; } #else *p++ = *s++; #endif continue; } s++; if (s == end) { PyErr_SetString(PyExc_ValueError, "Trailing \\ in string"); goto failed; } switch (*s++) { /* XXX This assumes ASCII! */ case '\n': break; case '\\': *p++ = '\\'; break; case '\'': *p++ = '\''; break; case '\"': *p++ = '\"'; break; case 'b': *p++ = '\b'; break; case 'f': *p++ = '\014'; break; /* FF */ case 't': *p++ = '\t'; break; case 'n': *p++ = '\n'; break; case 'r': *p++ = '\r'; break; case 'v': *p++ = '\013'; break; /* VT */ case 'a': *p++ = '\007'; break; /* BEL, not classic C */ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': c = s[-1] - '0'; if (s < end && '0' <= *s && *s <= '7') { c = (c << 3) + *s++ - '0'; if (s < end && '0' <= *s && *s <= '7') c = (c << 3) + *s++ - '0'; } *p++ = c; break; case 'x': if (s + 1 < end && isxdigit(Py_CHARMASK(s[0])) && isxdigit(Py_CHARMASK(s[1]))) { unsigned int x = 0; c = Py_CHARMASK(*s); s++; if (isdigit(c)) x = c - '0'; else if (islower(c)) x = 10 + c - 'a'; else x = 10 + c - 'A'; x = x << 4; c = Py_CHARMASK(*s); s++; if (isdigit(c)) x += c - '0'; else if (islower(c)) x += 10 + c - 'a'; else x += 10 + c - 'A'; *p++ = x; break; } if (!errors || strcmp(errors, "strict") == 0) { PyErr_SetString(PyExc_ValueError, "invalid \\x escape"); goto failed; } if (strcmp(errors, "replace") == 0) { *p++ = '?'; } else if (strcmp(errors, "ignore") == 0) /* do nothing */; else { PyErr_Format(PyExc_ValueError, "decoding error; " "unknown error handling code: %.400s", errors); goto failed; } /* skip \x */ if (s < end && isxdigit(Py_CHARMASK(s[0]))) s++; /* and a hexdigit */ break; #ifndef Py_USING_UNICODE case 'u': case 'U': case 'N': if (unicode) { PyErr_SetString(PyExc_ValueError, "Unicode escapes not legal " "when Unicode disabled"); goto failed; } #endif default: *p++ = '\\'; s--; goto non_esc; /* an arbitrary number of unescaped UTF-8 bytes may follow. */ } } if (p - buf < newlen) _PyString_Resize(&v, p - buf); /* v is cleared on error */ return v; failed: Py_DECREF(v); return NULL; } extern "C" size_t unicodeHashUnboxed(PyUnicodeObject* self) { Py_ssize_t len; Py_UNICODE* p; long x; #ifdef Py_DEBUG assert(_Py_HashSecret_Initialized); #endif if (self->hash != -1) return self->hash; len = PyUnicode_GET_SIZE(self); /* We make the hash of the empty string be 0, rather than using (prefix ^ suffix), since this slightly obfuscates the hash secret */ if (len == 0) { self->hash = 0; return 0; } p = PyUnicode_AS_UNICODE(self); x = _Py_HashSecret.prefix; x ^= *p << 7; while (--len >= 0) x = (1000003 * x) ^ *p++; x ^= PyUnicode_GET_SIZE(self); x ^= _Py_HashSecret.suffix; if (x == -1) x = -2; self->hash = x; return x; } extern "C" size_t strHashUnboxed(BoxedString* self) { assert(PyString_Check(self)); const char* p; long x; #ifdef Py_DEBUG assert(_Py_HashSecret_Initialized); #endif long len = Py_SIZE(self); /* We make the hash of the empty string be 0, rather than using (prefix ^ suffix), since this slightly obfuscates the hash secret */ if (len == 0) { return 0; } p = self->s().data(); x = _Py_HashSecret.prefix; x ^= *p << 7; while (--len >= 0) x = (1000003 * x) ^ *p++; x ^= Py_SIZE(self); x ^= _Py_HashSecret.suffix; if (x == -1) x = -2; return x; } extern "C" Box* strHash(BoxedString* self) { return boxInt(strHashUnboxed(self)); } size_t str_hash(BoxedString* self) noexcept { return strHashUnboxed(self); } extern "C" Box* strNonzero(BoxedString* self) { ASSERT(PyString_Check(self), "%s", self->cls->tp_name); return boxBool(self->size() != 0); } extern "C" Box* strNew(BoxedClass* cls, Box* obj) { assert(isSubclass(cls, str_cls)); if (cls != str_cls) { Box* tmp = strNew(str_cls, obj); assert(PyString_Check(tmp)); BoxedString* tmp_s = static_cast<BoxedString*>(tmp); return new (cls, tmp_s->size()) BoxedString(tmp_s->s()); } Box* r = PyObject_Str(obj); if (!r) throwCAPIException(); assert(PyString_Check(r)); return r; } extern "C" Box* basestringNew(BoxedClass* cls, Box* args, Box* kwargs) { raiseExcHelper(TypeError, "The basestring type cannot be instantiated"); } Box* _strSlice(BoxedString* self, i64 start, i64 stop, i64 step, i64 length) { assert(PyString_Check(self)); llvm::StringRef s = self->s(); assert(step != 0); if (step > 0) { assert(0 <= start); assert(stop <= s.size()); } else { assert(start < s.size()); assert(-1 <= stop); } assert(length >= 0); if (length == 0) return EmptyString; BoxedString* bs = BoxedString::createUninitializedString(length); copySlice(bs->data(), s.data(), start, step, length); return bs; } static Box* str_slice(Box* o, Py_ssize_t i, Py_ssize_t j) { BoxedString* a = static_cast<BoxedString*>(o); if (i < 0) i = 0; if (j < 0) j = 0; /* Avoid signed/unsigned bug in next line */ if (j > Py_SIZE(a)) j = Py_SIZE(a); if (i == 0 && j == Py_SIZE(a) && PyString_CheckExact(a)) { /* It's the same as a */ Py_INCREF(a); return (PyObject*)a; } if (j < i) j = i; return PyString_FromStringAndSize(a->data() + i, j - i); } // Analoguous to CPython's, used for sq_ slots. static Py_ssize_t str_length(Box* a) { return Py_SIZE(a); } Box* strIsAlpha(BoxedString* self) { assert(PyString_Check(self)); llvm::StringRef str(self->s()); if (str.empty()) return False; for (const auto& c : str) { if (!std::isalpha(c)) return False; } return True; } Box* strIsDigit(BoxedString* self) { assert(PyString_Check(self)); llvm::StringRef str(self->s()); if (str.empty()) return False; for (const auto& c : str) { if (!std::isdigit(c)) return False; } return True; } Box* strIsAlnum(BoxedString* self) { assert(PyString_Check(self)); llvm::StringRef str(self->s()); if (str.empty()) return False; for (const auto& c : str) { if (!std::isalnum(c)) return False; } return True; } Box* strIsLower(BoxedString* self) { assert(PyString_Check(self)); llvm::StringRef str(self->s()); bool lowered = false; if (str.empty()) return False; for (const auto& c : str) { if (std::isspace(c) || std::isdigit(c)) { continue; } else if (!std::islower(c)) { return False; } else { lowered = true; } } return boxBool(lowered); } Box* strIsUpper(BoxedString* self) { assert(PyString_Check(self)); llvm::StringRef str(self->s()); if (str.empty()) return False; bool cased = false; for (const auto& c : str) { if (std::islower(c)) return False; else if (!cased && isupper(c)) cased = true; } return boxBool(cased); } Box* strIsSpace(BoxedString* self) { assert(PyString_Check(self)); llvm::StringRef str(self->s()); if (str.empty()) return False; for (const auto& c : str) { if (!std::isspace(c)) return False; } return True; } Box* strIsTitle(BoxedString* self) { assert(PyString_Check(self)); llvm::StringRef str(self->s()); if (str.empty()) return False; if (str.size() == 1) return boxBool(std::isupper(str[0])); bool cased = false, start_of_word = true; for (const auto& c : str) { if (std::isupper(c)) { if (!start_of_word) { return False; } start_of_word = false; cased = true; } else if (std::islower(c)) { if (start_of_word) { return False; } start_of_word = false; cased = true; } else { start_of_word = true; } } return boxBool(cased); } extern "C" PyObject* _PyString_Join(PyObject* sep, PyObject* x) noexcept { RELEASE_ASSERT(PyString_Check(sep), ""); return string_join((PyStringObject*)sep, x); } Box* strReplace(Box* _self, Box* _old, Box* _new, Box** _args) { if (!PyString_Check(_self)) raiseExcHelper(TypeError, "descriptor 'replace' requires a 'str' object but received a '%s'", getTypeName(_self)); BoxedString* self = static_cast<BoxedString*>(_self); #ifdef Py_USING_UNICODE if (PyUnicode_Check(_old) || PyUnicode_Check(_new)) return PyUnicode_Replace((PyObject*)self, _old, _new, -1 /*count*/); #endif if (!PyString_Check(_old)) raiseExcHelper(TypeError, "expected a character buffer object"); BoxedString* old = static_cast<BoxedString*>(_old); if (!PyString_Check(_new)) raiseExcHelper(TypeError, "expected a character buffer object"); BoxedString* new_ = static_cast<BoxedString*>(_new); Box* _maxreplace = _args[0]; if (!PyInt_Check(_maxreplace)) raiseExcHelper(TypeError, "an integer is required"); int max_replaces = static_cast<BoxedInt*>(_maxreplace)->n; size_t start_pos = 0; std::string s = self->s(); bool single_char = old->size() == 1; for (int num_replaced = 0; num_replaced < max_replaces || max_replaces < 0; ++num_replaced) { if (single_char) start_pos = s.find(old->s()[0], start_pos); else start_pos = s.find(old->s(), start_pos); if (start_pos == std::string::npos) break; s.replace(start_pos, old->size(), new_->s()); start_pos += new_->size(); // Handles case where 'to' is a substring of 'from' } return boxString(s); } Box* strPartition(BoxedString* self, BoxedString* sep) { RELEASE_ASSERT(PyString_Check(self), ""); RELEASE_ASSERT(PyString_Check(sep), ""); size_t found_idx; if (sep->size() == 1) found_idx = self->s().find(sep->s()[0]); else found_idx = self->s().find(sep->s()); if (found_idx == std::string::npos) return BoxedTuple::create({ self, EmptyString, EmptyString }); return BoxedTuple::create( { boxString(llvm::StringRef(self->data(), found_idx)), boxString(llvm::StringRef(self->data() + found_idx, sep->size())), boxString(llvm::StringRef(self->data() + found_idx + sep->size(), self->size() - found_idx - sep->size())) }); } Box* strRpartition(BoxedString* self, BoxedString* sep) { RELEASE_ASSERT(PyString_Check(self), ""); RELEASE_ASSERT(PyString_Check(sep), ""); size_t found_idx = self->s().rfind(sep->s()); if (found_idx == std::string::npos) return BoxedTuple::create({ EmptyString, EmptyString, self }); return BoxedTuple::create( { boxString(llvm::StringRef(self->data(), found_idx)), boxString(llvm::StringRef(self->data() + found_idx, sep->size())), boxString(llvm::StringRef(self->data() + found_idx + sep->size(), self->size() - found_idx - sep->size())) }); } extern "C" PyObject* _do_string_format(PyObject* self, PyObject* args, PyObject* kwargs); Box* strFormat(BoxedString* self, BoxedTuple* args, BoxedDict* kwargs) { assert(args->cls == tuple_cls); assert(!kwargs || kwargs->cls == dict_cls); Box* rtn = _do_string_format(self, args, kwargs); checkAndThrowCAPIException(); assert(rtn); return rtn; } Box* strStrip(BoxedString* self, Box* chars) { assert(PyString_Check(self)); auto str = self->s(); if (PyString_Check(chars)) { auto chars_str = static_cast<BoxedString*>(chars)->s(); return boxString(str.trim(chars_str)); } else if (chars->cls == none_cls) { return boxString(str.trim(" \t\n\r\f\v")); } else if (isSubclass(chars->cls, unicode_cls)) { PyObject* uniself = PyUnicode_FromObject((PyObject*)self); PyObject* res; if (uniself == NULL) throwCAPIException(); res = _PyUnicode_XStrip((PyUnicodeObject*)uniself, BOTHSTRIP, chars); if (!res) throwCAPIException(); Py_DECREF(uniself); return res; } else { raiseExcHelper(TypeError, "strip arg must be None, str or unicode"); } } Box* strLStrip(BoxedString* self, Box* chars) { assert(PyString_Check(self)); auto str = self->s(); if (PyString_Check(chars)) { auto chars_str = static_cast<BoxedString*>(chars)->s(); return boxString(str.ltrim(chars_str)); } else if (chars->cls == none_cls) { return boxString(str.ltrim(" \t\n\r\f\v")); } else if (isSubclass(chars->cls, unicode_cls)) { PyObject* uniself = PyUnicode_FromObject((PyObject*)self); PyObject* res; if (uniself == NULL) throwCAPIException(); res = _PyUnicode_XStrip((PyUnicodeObject*)uniself, LEFTSTRIP, chars); if (!res) throwCAPIException(); Py_DECREF(uniself); return res; } else { raiseExcHelper(TypeError, "lstrip arg must be None, str or unicode"); } } Box* strRStrip(BoxedString* self, Box* chars) { assert(PyString_Check(self)); auto str = self->s(); if (PyString_Check(chars)) { auto chars_str = static_cast<BoxedString*>(chars)->s(); return boxString(str.rtrim(chars_str)); } else if (chars->cls == none_cls) { return boxString(str.rtrim(" \t\n\r\f\v")); } else if (isSubclass(chars->cls, unicode_cls)) { PyObject* uniself = PyUnicode_FromObject((PyObject*)self); PyObject* res; if (uniself == NULL) throwCAPIException(); res = _PyUnicode_XStrip((PyUnicodeObject*)uniself, RIGHTSTRIP, chars); if (!res) throwCAPIException(); Py_DECREF(uniself); return res; } else { raiseExcHelper(TypeError, "rstrip arg must be None, str or unicode"); } } Box* strCapitalize(BoxedString* self) { assert(PyString_Check(self)); std::string s(self->s()); for (auto& i : s) { i = std::tolower(i); } if (!s.empty()) { s[0] = std::toupper(s[0]); } return boxString(s); } Box* strTitle(BoxedString* self) { assert(PyString_Check(self)); std::string s(self->s()); bool start_of_word = false; for (auto& i : s) { if (std::islower(i)) { if (!start_of_word) { i = std::toupper(i); } start_of_word = true; } else if (std::isupper(i)) { if (start_of_word) { i = std::tolower(i); } start_of_word = true; } else { start_of_word = false; } } return boxString(s); } Box* strTranslate(BoxedString* self, BoxedString* table, BoxedString* delete_chars) { if (!PyString_Check(self)) raiseExcHelper(TypeError, "descriptor 'translate' requires a 'str' object but received a '%s'", getTypeName(self)); std::unordered_set<char> delete_set; if (delete_chars) { if (!PyString_Check(delete_chars)) raiseExcHelper(TypeError, "expected a character buffer object"); delete_set.insert(delete_chars->s().begin(), delete_chars->s().end()); } bool have_table = table != None; if (have_table) { if (!PyString_Check(table)) raiseExcHelper(TypeError, "expected a character buffer object"); if (table->size() != 256) raiseExcHelper(ValueError, "translation table must be 256 characters long"); } std::string str; for (const char c : self->s()) { if (!delete_set.count(c)) str.append(1, have_table ? table->s()[(unsigned char)c] : c); } return boxString(str); } Box* strLower(BoxedString* self) { assert(PyString_Check(self)); BoxedString* rtn = new (self->size()) BoxedString(self->s()); for (int i = 0; i < rtn->size(); i++) rtn->data()[i] = std::tolower(rtn->data()[i]); return rtn; } Box* strUpper(BoxedString* self) { assert(PyString_Check(self)); BoxedString* rtn = new (self->size()) BoxedString(self->s()); for (int i = 0; i < rtn->size(); i++) rtn->data()[i] = std::toupper(rtn->data()[i]); return rtn; } Box* strSwapcase(BoxedString* self) { assert(PyString_Check(self)); BoxedString* rtn = new (self->size()) BoxedString(self->s()); for (int i = 0; i < rtn->size(); i++) { char c = rtn->data()[i]; if (std::islower(c)) rtn->data()[i] = std::toupper(c); else if (std::isupper(c)) rtn->data()[i] = std::tolower(c); } return rtn; } static inline int string_contains_shared(BoxedString* self, Box* elt) { assert(PyString_Check(self)); if (PyUnicode_Check(elt)) { int r = PyUnicode_Contains(self, elt); if (r < 0) throwCAPIException(); return r; } if (!PyString_Check(elt)) raiseExcHelper(TypeError, "'in <string>' requires string as left operand, not %s", getTypeName(elt)); BoxedString* sub = static_cast<BoxedString*>(elt); size_t found_idx; if (sub->size() == 1) // Call the much-faster single-character find(): found_idx = self->s().find(sub->s()[0]); else found_idx = self->s().find(sub->s()); return (found_idx != std::string::npos); } // Analoguous to CPython's, used for sq_ slots. static int string_contains(PyObject* str_obj, PyObject* sub_obj) { return string_contains_shared((BoxedString*)str_obj, sub_obj); } Box* strContains(BoxedString* self, Box* elt) { return boxBool(string_contains_shared(self, elt)); } // compares (a+a_pos, len) with (str) // if len == npos, compare to the end of a static int compareStringRefs(llvm::StringRef a, size_t a_pos, size_t len, llvm::StringRef str) { if (len == llvm::StringRef::npos) len = a.size() - a_pos; if (a_pos + len > a.size()) throw std::out_of_range("pos+len out of range"); return llvm::StringRef(a.data() + a_pos, len).compare(str); } extern "C" int _PyString_Eq(PyObject* o1, PyObject* o2) noexcept { assert(PyString_Check(o1)); assert(PyString_Check(o2)); BoxedString* a = (BoxedString*)o1; BoxedString* b = (BoxedString*)o2; return a->s() == b->s(); } Box* strStartswith(BoxedString* self, Box* elt, Box* start, Box** _args) { Box* end = _args[0]; if (!PyString_Check(self)) raiseExcHelper(TypeError, "descriptor 'startswith' requires a 'str' object but received a '%s'", getTypeName(self)); Py_ssize_t istart = 0, iend = PY_SSIZE_T_MAX; if (start) { int r = _PyEval_SliceIndex(start, &istart); if (!r) throwCAPIException(); } if (end) { int r = _PyEval_SliceIndex(end, &iend); if (!r) throwCAPIException(); } if (PyTuple_Check(elt)) { for (auto e : *static_cast<BoxedTuple*>(elt)) { auto b = strStartswith(self, e, start, _args); assert(b->cls == bool_cls); if (b == True) return True; } return False; } if (isSubclass(elt->cls, unicode_cls)) { int r = PyUnicode_Tailmatch(self, elt, istart, iend, -1); if (r < 0) throwCAPIException(); assert(r == 0 || r == 1); return boxBool(r); } if (!PyString_Check(elt)) raiseExcHelper(TypeError, "expected a character buffer object"); BoxedString* sub = static_cast<BoxedString*>(elt); Py_ssize_t n = self->size(); iend = std::min(iend, n); if (iend < 0) iend += n; if (iend < 0) iend = 0; if (istart < 0) istart += n; if (istart < 0) istart = 0; Py_ssize_t compare_len = iend - istart; if (compare_len < 0) return False; if (sub->size() > compare_len) return False; return boxBool(compareStringRefs(self->s(), istart, sub->size(), sub->s()) == 0); } Box* strEndswith(BoxedString* self, Box* elt, Box* start, Box** _args) { Box* end = _args[0]; if (!PyString_Check(self)) raiseExcHelper(TypeError, "descriptor 'endswith' requires a 'str' object but received a '%s'", getTypeName(self)); Py_ssize_t istart = 0, iend = PY_SSIZE_T_MAX; if (start) { int r = _PyEval_SliceIndex(start, &istart); if (!r) throwCAPIException(); } if (end) { int r = _PyEval_SliceIndex(end, &iend); if (!r) throwCAPIException(); } if (isSubclass(elt->cls, unicode_cls)) { int r = PyUnicode_Tailmatch(self, elt, istart, iend, +1); if (r < 0) throwCAPIException(); assert(r == 0 || r == 1); return boxBool(r); } if (PyTuple_Check(elt)) { for (auto e : *static_cast<BoxedTuple*>(elt)) { auto b = strEndswith(self, e, start, _args); assert(b->cls == bool_cls); if (b == True) return True; } return False; } if (!PyString_Check(elt)) raiseExcHelper(TypeError, "expected a character buffer object"); BoxedString* sub = static_cast<BoxedString*>(elt); Py_ssize_t n = self->size(); iend = std::min(iend, n); if (iend < 0) iend += n; if (iend < 0) iend = 0; if (istart < 0) istart += n; if (istart < 0) istart = 0; Py_ssize_t compare_len = iend - istart; if (compare_len < 0) return False; if (sub->size() > compare_len) return False; // XXX: this line is the only difference between startswith and endswith: istart += compare_len - sub->size(); return boxBool(compareStringRefs(self->s(), istart, sub->size(), sub->s()) == 0); } Box* strDecode(BoxedString* self, Box* encoding, Box* error) { if (!PyString_Check(self)) raiseExcHelper(TypeError, "descriptor 'decode' requires a 'str' object but received a '%s'", getTypeName(self)); BoxedString* encoding_str = (BoxedString*)encoding; BoxedString* error_str = (BoxedString*)error; if (encoding_str && encoding_str->cls == unicode_cls) encoding_str = (BoxedString*)_PyUnicode_AsDefaultEncodedString(encoding_str, NULL); if (encoding_str && !PyString_Check(encoding_str)) raiseExcHelper(TypeError, "decode() argument 1 must be string, not '%s'", getTypeName(encoding_str)); if (error_str && error_str->cls == unicode_cls) error_str = (BoxedString*)_PyUnicode_AsDefaultEncodedString(error_str, NULL); if (error_str && !PyString_Check(error_str)) raiseExcHelper(TypeError, "decode() argument 2 must be string, not '%s'", getTypeName(error_str)); Box* result = PyString_AsDecodedObject(self, encoding_str ? encoding_str->data() : NULL, error_str ? error_str->data() : NULL); checkAndThrowCAPIException(); return result; } Box* strEncode(BoxedString* self, Box* encoding, Box* error) { if (!PyString_Check(self)) raiseExcHelper(TypeError, "descriptor 'encode' requires a 'str' object but received a '%s'", getTypeName(self)); BoxedString* encoding_str = (BoxedString*)encoding; BoxedString* error_str = (BoxedString*)error; if (encoding_str && encoding_str->cls == unicode_cls) encoding_str = (BoxedString*)_PyUnicode_AsDefaultEncodedString(encoding_str, NULL); if (encoding_str && !PyString_Check(encoding_str)) raiseExcHelper(TypeError, "encode() argument 1 must be string, not '%s'", getTypeName(encoding_str)); if (error_str && error_str->cls == unicode_cls) error_str = (BoxedString*)_PyUnicode_AsDefaultEncodedString(error_str, NULL); if (error_str && !PyString_Check(error_str)) raiseExcHelper(TypeError, "encode() argument 2 must be string, not '%s'", getTypeName(error_str)); Box* result = PyString_AsEncodedObject(self, encoding_str ? encoding_str->data() : PyUnicode_GetDefaultEncoding(), error_str ? error_str->data() : NULL); checkAndThrowCAPIException(); return result; } static PyObject* string_item(PyStringObject* self, register Py_ssize_t i) { BoxedString* boxedString = (BoxedString*)self; if (i < 0 || i >= boxedString->size()) { raiseExcHelper(IndexError, "string index out of range"); } char c = boxedString->s()[i]; return characters[c & UCHAR_MAX]; } template <ExceptionStyle S> Box* strGetitem(BoxedString* self, Box* slice) { if (S == CAPI) { try { return strGetitem<CXX>(self, slice); } catch (ExcInfo e) { setCAPIException(e); return NULL; } } assert(PyString_Check(self)); if (PyIndex_Check(slice)) { Py_ssize_t n = PyNumber_AsSsize_t(slice, PyExc_IndexError); if (n == -1 && PyErr_Occurred()) throwCAPIException(); int size = self->size(); if (n < 0) n = size + n; if (n < 0 || n >= size) { raiseExcHelper(IndexError, "string index out of range"); } char c = self->s()[n]; return characters[c & UCHAR_MAX]; } else if (slice->cls == slice_cls) { BoxedSlice* sslice = static_cast<BoxedSlice*>(slice); i64 start, stop, step, length; parseSlice(sslice, self->size(), &start, &stop, &step, &length); return _strSlice(self, start, stop, step, length); } else { raiseExcHelper(TypeError, "string indices must be integers, not %s", getTypeName(slice)); } } extern "C" Box* strGetslice(BoxedString* self, Box* boxedStart, Box* boxedStop) { assert(PyString_Check(self)); i64 start, stop; sliceIndex(boxedStart, &start); sliceIndex(boxedStop, &stop); boundSliceWithLength(&start, &stop, start, stop, self->s().size()); return _strSlice(self, start, stop, 1, stop - start); } // TODO it looks like strings don't have their own iterators, but instead // rely on the sequence iteration protocol. // Should probably implement that, and maybe once that's implemented get // rid of the striterator class? BoxedClass* str_iterator_cls = NULL; class BoxedStringIterator : public Box { public: BoxedString* s; std::string::const_iterator it, end; BoxedStringIterator(BoxedString* s) : s(s), it(s->s().begin()), end(s->s().end()) {} DEFAULT_CLASS(str_iterator_cls); static bool hasnextUnboxed(BoxedStringIterator* self) { assert(self->cls == str_iterator_cls); return self->it != self->end; } static Box* hasnext(BoxedStringIterator* self) { assert(self->cls == str_iterator_cls); return boxBool(self->it != self->end); } static Box* iter(BoxedStringIterator* self) { assert(self->cls == str_iterator_cls); return self; } static Box* next(BoxedStringIterator* self) { assert(self->cls == str_iterator_cls); assert(hasnextUnboxed(self)); char c = *self->it; ++self->it; return characters[c & UCHAR_MAX]; } static void gcHandler(GCVisitor* v, Box* b) { Box::gcHandler(v, b); BoxedStringIterator* it = (BoxedStringIterator*)b; v->visit(it->s); } }; Box* strIter(BoxedString* self) noexcept { assert(PyString_Check(self)); return new BoxedStringIterator(self); } extern "C" PyObject* PyString_FromString(const char* s) noexcept { return boxString(s); } extern "C" int PyString_AsStringAndSize(register PyObject* obj, register char** s, register Py_ssize_t* len) noexcept { if (s == NULL) { PyErr_BadInternalCall(); return -1; } if (!PyString_Check(obj)) { #ifdef Py_USING_UNICODE if (PyUnicode_Check(obj)) { obj = _PyUnicode_AsDefaultEncodedString(obj, NULL); if (obj == NULL) return -1; } else #endif { PyErr_Format(PyExc_TypeError, "expected string or Unicode object, " "%.200s found", Py_TYPE(obj)->tp_name); return -1; } } *s = PyString_AS_STRING(obj); if (len != NULL) *len = PyString_GET_SIZE(obj); else if (strlen(*s) != (size_t)PyString_GET_SIZE(obj)) { PyErr_SetString(PyExc_TypeError, "expected string without null bytes"); return -1; } return 0; } extern "C" PyObject* PyString_FromStringAndSize(const char* s, ssize_t n) noexcept { if (s == NULL) return BoxedString::createUninitializedString(n); return boxString(llvm::StringRef(s, n)); } static /*const*/ char* string_getbuffer(register PyObject* op) noexcept { char* s; Py_ssize_t len; if (PyString_AsStringAndSize(op, &s, &len)) return NULL; return s; } extern "C" char* PyString_AsString(PyObject* o) noexcept { if (!PyString_Check(o)) return string_getbuffer(o); BoxedString* s = static_cast<BoxedString*>(o); return s->getWriteableStringContents(); } extern "C" Py_ssize_t PyString_Size(PyObject* op) noexcept { if (PyString_Check(op)) return static_cast<BoxedString*>(op)->size(); char* _s; Py_ssize_t len; if (PyString_AsStringAndSize(op, &_s, &len)) return -1; return len; } extern "C" int _PyString_Resize(PyObject** pv, Py_ssize_t newsize) noexcept { // This is only allowed to be called when there is only one user of the string (ie a refcount of 1 in CPython) assert(pv); assert(PyString_Check(*pv)); BoxedString* s = static_cast<BoxedString*>(*pv); if (newsize == s->size()) return 0; if (PyString_CHECK_INTERNED(s)) { *pv = 0; return -1; } if (newsize < s->size()) { // XXX resize the box (by reallocating) smaller if it makes sense s->ob_size = newsize; s->data()[newsize] = 0; return 0; } BoxedString* resized; if (s->cls == str_cls) resized = new (newsize) BoxedString(newsize, 0); // we need an uninitialized string, but this will memset else resized = new (s->cls, newsize) BoxedString(newsize, 0); // we need an uninitialized string, but this will memset memmove(resized->data(), s->data(), s->size()); *pv = resized; return 0; } extern "C" void PyString_Concat(register PyObject** pv, register PyObject* w) noexcept { try { if (*pv == NULL) return; if (w == NULL || !PyString_Check(*pv)) { *pv = NULL; return; } *pv = strAdd((BoxedString*)*pv, w); } catch (ExcInfo e) { setCAPIException(e); *pv = NULL; } } extern "C" void PyString_ConcatAndDel(register PyObject** pv, register PyObject* w) noexcept { PyString_Concat(pv, w); } static PyObject* string_expandtabs(PyStringObject* self, PyObject* args) noexcept { const char* e, *p, *qe; char* q; Py_ssize_t i, j, incr; PyObject* u; int tabsize = 8; if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) return NULL; /* First pass: determine size of output string */ i = 0; /* chars up to and including most recent \n or \r */ j = 0; /* chars since most recent \n or \r (use in tab calculations) */ e = PyString_AS_STRING(self) + PyString_GET_SIZE(self); /* end of input */ for (p = PyString_AS_STRING(self); p < e; p++) { if (*p == '\t') { if (tabsize > 0) { incr = tabsize - (j % tabsize); if (j > PY_SSIZE_T_MAX - incr) goto overflow1; j += incr; } } else { if (j > PY_SSIZE_T_MAX - 1) goto overflow1; j++; if (*p == '\n' || *p == '\r') { if (i > PY_SSIZE_T_MAX - j) goto overflow1; i += j; j = 0; } } } if (i > PY_SSIZE_T_MAX - j) goto overflow1; /* Second pass: create output string and fill it */ u = PyString_FromStringAndSize(NULL, i + j); if (!u) return NULL; j = 0; /* same as in first pass */ q = PyString_AS_STRING(u); /* next output char */ qe = PyString_AS_STRING(u) + PyString_GET_SIZE(u); /* end of output */ for (p = PyString_AS_STRING(self); p < e; p++) { if (*p == '\t') { if (tabsize > 0) { i = tabsize - (j % tabsize); j += i; while (i--) { if (q >= qe) goto overflow2; *q++ = ' '; } } } else { if (q >= qe) goto overflow2; *q++ = *p; j++; if (*p == '\n' || *p == '\r') j = 0; } } return u; overflow2: Py_DECREF(u); overflow1: PyErr_SetString(PyExc_OverflowError, "new string is too long"); return NULL; } static PyObject* string_zfill(PyObject* self, PyObject* args) { Py_ssize_t fill; PyObject* s; char* p; Py_ssize_t width; if (!PyArg_ParseTuple(args, "n:zfill", &width)) return NULL; if (PyString_GET_SIZE(self) >= width) { if (PyString_CheckExact(self)) { Py_INCREF(self); return (PyObject*)self; } else return PyString_FromStringAndSize(PyString_AS_STRING(self), PyString_GET_SIZE(self)); } fill = width - PyString_GET_SIZE(self); // Pyston change: // s = pad(self, fill, 0, '0'); s = pad((BoxedString*)self, boxInt(width), characters['0' & UCHAR_MAX], JUST_RIGHT); if (s == NULL) return NULL; p = PyString_AS_STRING(s); if (p[fill] == '+' || p[fill] == '-') { /* move sign to beginning of string */ p[0] = p[fill]; p[fill] = '0'; } return (PyObject*)s; } static int string_print(PyObject* _op, FILE* fp, int flags) noexcept { BoxedString* op = (BoxedString*)_op; Py_ssize_t i, str_len; char c; int quote; /* XXX Ought to check for interrupts when writing long strings */ if (!PyString_CheckExact(op)) { int ret; /* A str subclass may have its own __str__ method. */ op = (BoxedString*)PyObject_Str((PyObject*)op); if (op == NULL) return -1; ret = string_print(op, fp, flags); Py_DECREF(op); return ret; } if (flags & Py_PRINT_RAW) { // Pyston change // char *data = op->ob_sval; // Py_ssize_t size = Py_SIZE(op); const char* data = op->data(); Py_ssize_t size = op->size(); Py_BEGIN_ALLOW_THREADS while (size > INT_MAX) { /* Very long strings cannot be written atomically. * But don't write exactly INT_MAX bytes at a time * to avoid memory aligment issues. */ const int chunk_size = INT_MAX & ~0x3FFF; fwrite(data, 1, chunk_size, fp); data += chunk_size; size -= chunk_size; } #ifdef __VMS if (size) fwrite(data, (size_t)size, 1, fp); #else fwrite(data, 1, (size_t)size, fp); #endif Py_END_ALLOW_THREADS return 0; } /* figure out which quote to use; single is preferred */ quote = '\''; // Pyston change // if (memchr(op->ob_sval, '\'', Py_SIZE(op)) && !memchr(op->ob_sval, '"', Py_SIZE(op))) if (memchr(op->data(), '\'', Py_SIZE(op)) && !memchr(op->data(), '"', Py_SIZE(op))) quote = '"'; // Pyston change // str_len = Py_SIZE(op); str_len = op->size(); Py_BEGIN_ALLOW_THREADS fputc(quote, fp); for (i = 0; i < str_len; i++) { /* Since strings are immutable and the caller should have a reference, accessing the interal buffer should not be an issue with the GIL released. */ // Pyston change: // c = op->ob_sval[i]; c = op->s()[i]; if (c == quote || c == '\\') fprintf(fp, "\\%c", c); else if (c == '\t') fprintf(fp, "\\t"); else if (c == '\n') fprintf(fp, "\\n"); else if (c == '\r') fprintf(fp, "\\r"); else if (c < ' ' || c >= 0x7f) fprintf(fp, "\\x%02x", c & 0xff); else fputc(c, fp); } fputc(quote, fp); Py_END_ALLOW_THREADS return 0; } static Py_ssize_t string_buffer_getreadbuf(PyObject* self, Py_ssize_t index, const void** ptr) noexcept { RELEASE_ASSERT(index == 0, ""); // I think maybe this can just be a non-release assert? shouldn't be able to call this with // the wrong type RELEASE_ASSERT(PyString_Check(self), ""); auto s = static_cast<BoxedString*>(self); *ptr = s->data(); return s->size(); } static Py_ssize_t string_buffer_getsegcount(PyObject* o, Py_ssize_t* lenp) noexcept { RELEASE_ASSERT(lenp == NULL, ""); RELEASE_ASSERT(PyString_Check(o), ""); return 1; } static Py_ssize_t string_buffer_getcharbuf(PyStringObject* self, Py_ssize_t index, const char** ptr) { if (index != 0) { PyErr_SetString(PyExc_SystemError, "accessing non-existent string segment"); return -1; } return string_buffer_getreadbuf((PyObject*)self, index, (const void**)ptr); } static int string_buffer_getbuffer(BoxedString* self, Py_buffer* view, int flags) noexcept { assert(PyString_Check(self)); return PyBuffer_FillInfo(view, (PyObject*)self, self->data(), self->size(), 1, flags); } static PyBufferProcs string_as_buffer = { (readbufferproc)string_buffer_getreadbuf, // comments are the only way I've found of (writebufferproc)NULL, // forcing clang-format to break these onto multiple lines (segcountproc)string_buffer_getsegcount, // (charbufferproc)string_buffer_getcharbuf, // (getbufferproc)string_buffer_getbuffer, // (releasebufferproc)NULL, }; static PyMethodDef string_methods[] = { { "count", (PyCFunction)string_count, METH_O3 | METH_D2, NULL }, { "join", (PyCFunction)string_join, METH_O, NULL }, { "split", (PyCFunction)string_split, METH_VARARGS, NULL }, { "rsplit", (PyCFunction)string_rsplit, METH_VARARGS, NULL }, { "find", (PyCFunction)string_find, METH_VARARGS, NULL }, { "index", (PyCFunction)string_index, METH_VARARGS, NULL }, { "rindex", (PyCFunction)string_rindex, METH_VARARGS, NULL }, { "rfind", (PyCFunction)string_rfind, METH_VARARGS, NULL }, { "expandtabs", (PyCFunction)string_expandtabs, METH_VARARGS, NULL }, { "splitlines", (PyCFunction)string_splitlines, METH_VARARGS, NULL }, { "zfill", (PyCFunction)string_zfill, METH_VARARGS, NULL }, { "__format__", (PyCFunction)string__format__, METH_VARARGS, NULL }, }; void setupStr() { str_cls->tp_flags |= Py_TPFLAGS_HAVE_NEWBUFFER; str_iterator_cls = BoxedHeapClass::create(type_cls, object_cls, &BoxedStringIterator::gcHandler, 0, 0, sizeof(BoxedStringIterator), false, "striterator"); str_iterator_cls->giveAttr("__hasnext__", new BoxedFunction(boxRTFunction((void*)BoxedStringIterator::hasnext, BOXED_BOOL, 1))); str_iterator_cls->giveAttr("__iter__", new BoxedFunction(boxRTFunction((void*)BoxedStringIterator::iter, UNKNOWN, 1))); str_iterator_cls->giveAttr("next", new BoxedFunction(boxRTFunction((void*)BoxedStringIterator::next, STR, 1))); str_iterator_cls->freeze(); str_iterator_cls->tpp_hasnext = (BoxedClass::pyston_inquiry)BoxedStringIterator::hasnextUnboxed; str_cls->tp_as_buffer = &string_as_buffer; str_cls->tp_print = string_print; str_cls->giveAttr("__len__", new BoxedFunction(boxRTFunction((void*)strLen, BOXED_INT, 1))); str_cls->giveAttr("__str__", new BoxedFunction(boxRTFunction((void*)strStr, STR, 1))); str_cls->giveAttr("__repr__", new BoxedFunction(boxRTFunction((void*)strRepr, STR, 1))); str_cls->giveAttr("__hash__", new BoxedFunction(boxRTFunction((void*)strHash, UNKNOWN, 1))); str_cls->giveAttr("__nonzero__", new BoxedFunction(boxRTFunction((void*)strNonzero, BOXED_BOOL, 1))); str_cls->giveAttr("isalnum", new BoxedFunction(boxRTFunction((void*)strIsAlnum, BOXED_BOOL, 1))); str_cls->giveAttr("isalpha", new BoxedFunction(boxRTFunction((void*)strIsAlpha, BOXED_BOOL, 1))); str_cls->giveAttr("isdigit", new BoxedFunction(boxRTFunction((void*)strIsDigit, BOXED_BOOL, 1))); str_cls->giveAttr("islower", new BoxedFunction(boxRTFunction((void*)strIsLower, BOXED_BOOL, 1))); str_cls->giveAttr("isspace", new BoxedFunction(boxRTFunction((void*)strIsSpace, BOXED_BOOL, 1))); str_cls->giveAttr("istitle", new BoxedFunction(boxRTFunction((void*)strIsTitle, BOXED_BOOL, 1))); str_cls->giveAttr("isupper", new BoxedFunction(boxRTFunction((void*)strIsUpper, BOXED_BOOL, 1))); str_cls->giveAttr("decode", new BoxedFunction(boxRTFunction((void*)strDecode, UNKNOWN, 3, 2, false, false), { 0, 0 })); str_cls->giveAttr("encode", new BoxedFunction(boxRTFunction((void*)strEncode, UNKNOWN, 3, 2, false, false), { 0, 0 })); str_cls->giveAttr("lower", new BoxedFunction(boxRTFunction((void*)strLower, STR, 1))); str_cls->giveAttr("swapcase", new BoxedFunction(boxRTFunction((void*)strSwapcase, STR, 1))); str_cls->giveAttr("upper", new BoxedFunction(boxRTFunction((void*)strUpper, STR, 1))); str_cls->giveAttr("strip", new BoxedFunction(boxRTFunction((void*)strStrip, UNKNOWN, 2, 1, false, false), { None })); str_cls->giveAttr("lstrip", new BoxedFunction(boxRTFunction((void*)strLStrip, UNKNOWN, 2, 1, false, false), { None })); str_cls->giveAttr("rstrip", new BoxedFunction(boxRTFunction((void*)strRStrip, UNKNOWN, 2, 1, false, false), { None })); str_cls->giveAttr("capitalize", new BoxedFunction(boxRTFunction((void*)strCapitalize, STR, 1))); str_cls->giveAttr("title", new BoxedFunction(boxRTFunction((void*)strTitle, STR, 1))); str_cls->giveAttr("translate", new BoxedFunction(boxRTFunction((void*)strTranslate, STR, 3, 1, false, false), { NULL })); str_cls->giveAttr("__contains__", new BoxedFunction(boxRTFunction((void*)strContains, BOXED_BOOL, 2))); str_cls->giveAttr("startswith", new BoxedFunction(boxRTFunction((void*)strStartswith, BOXED_BOOL, 4, 2, 0, 0), { NULL, NULL })); str_cls->giveAttr("endswith", new BoxedFunction(boxRTFunction((void*)strEndswith, BOXED_BOOL, 4, 2, 0, 0), { NULL, NULL })); str_cls->giveAttr("partition", new BoxedFunction(boxRTFunction((void*)strPartition, UNKNOWN, 2))); str_cls->giveAttr("rpartition", new BoxedFunction(boxRTFunction((void*)strRpartition, UNKNOWN, 2))); str_cls->giveAttr("format", new BoxedFunction(boxRTFunction((void*)strFormat, UNKNOWN, 1, 0, true, true))); str_cls->giveAttr("__add__", new BoxedFunction(boxRTFunction((void*)strAdd, UNKNOWN, 2))); str_cls->giveAttr("__mod__", new BoxedFunction(boxRTFunction((void*)strMod, UNKNOWN, 2))); str_cls->giveAttr("__mul__", new BoxedFunction(boxRTFunction((void*)strMul, UNKNOWN, 2))); // TODO not sure if this is right in all cases: str_cls->giveAttr("__rmul__", new BoxedFunction(boxRTFunction((void*)strMul, UNKNOWN, 2))); str_cls->tp_richcompare = str_richcompare; BoxedString* spaceChar = characters[' ' & UCHAR_MAX]; assert(spaceChar); str_cls->giveAttr("ljust", new BoxedFunction(boxRTFunction((void*)strLjust, UNKNOWN, 3, 1, false, false), { spaceChar })); str_cls->giveAttr("rjust", new BoxedFunction(boxRTFunction((void*)strRjust, UNKNOWN, 3, 1, false, false), { spaceChar })); str_cls->giveAttr("center", new BoxedFunction(boxRTFunction((void*)strCenter, UNKNOWN, 3, 1, false, false), { spaceChar })); auto str_getitem = boxRTFunction((void*)strGetitem<CXX>, STR, 2, ParamNames::empty(), CXX); addRTFunction(str_getitem, (void*)strGetitem<CAPI>, STR, CAPI); str_cls->giveAttr("__getitem__", new BoxedFunction(str_getitem)); str_cls->giveAttr("__getslice__", new BoxedFunction(boxRTFunction((void*)strGetslice, STR, 3))); str_cls->giveAttr("__iter__", new BoxedFunction(boxRTFunction((void*)strIter, typeFromClass(str_iterator_cls), 1))); str_cls->giveAttr("replace", new BoxedFunction(boxRTFunction((void*)strReplace, UNKNOWN, 4, 1, false, false), { boxInt(-1) })); for (auto& md : string_methods) { str_cls->giveAttr(md.ml_name, new BoxedMethodDescriptor(&md, str_cls)); } str_cls->giveAttr("__new__", new BoxedFunction(boxRTFunction((void*)strNew, UNKNOWN, 2, 1, false, false), { EmptyString })); add_operators(str_cls); str_cls->freeze(); str_cls->tp_repr = str_repr; str_cls->tp_iter = (decltype(str_cls->tp_iter))strIter; str_cls->tp_hash = (hashfunc)str_hash; str_cls->tp_as_sequence->sq_length = str_length; str_cls->tp_as_sequence->sq_item = (ssizeargfunc)string_item; str_cls->tp_as_sequence->sq_slice = str_slice; str_cls->tp_as_sequence->sq_contains = (objobjproc)string_contains; basestring_cls->giveAttr("__doc__", boxString("Type basestring cannot be instantiated; it is the base for str and unicode.")); basestring_cls->giveAttr("__new__", new BoxedFunction(boxRTFunction((void*)basestringNew, UNKNOWN, 1, 0, true, true))); basestring_cls->freeze(); } void teardownStr() { } }