- Fixed some refcount bugs.

- Implemented stemming in a simpler way.
- Made checkSynword() easier to read.
- Used PyList_GetItem() to do bounds checking in Splitter_item().
- Made Splitter_indexes slightly faster by keeping a local copy of the length.
- splitUnicodeString() now returns -1 on error.
- Made splitUnicodeString() easier to read.
- prepareString() performs a copy the standard way.

- Fixed some refcount bugs.
- Implemented stemming in a simpler way.
- Made checkSynword() easier to read.
- Used PyList_GetItem() to do bounds checking in Splitter_item().
- Made Splitter_indexes slightly faster by keeping a local copy of the length.
- splitUnicodeString() now returns -1 on error.
- Made splitUnicodeString() easier to read.
- prepareString() performs a copy the standard way.
c7b741b4 · Shane Hathaway · dad261b1 · c7b741b4
Commit c7b741b4 authored Oct 19, 2001 by Shane Hathaway
Show whitespace changes
Inline Side-by-side

Showing with 56 additions and 89 deletions

lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src/UnicodeSplitter.c .../TextIndex/Splitter/UnicodeSplitter/src/UnicodeSplitter.c +56 -89

No files found.
--- a/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src/UnicodeSplitter.c
+++ b/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src/UnicodeSplitter.c
 #include "Python.h"
 #define MAX_WORD 64		/* Words longer than MAX_WORD are stemmed */
+#ifndef min
+#define min(a,b) ((a)<(b)?(a):(b))
+#endif
 typedef struct
 {
    PyObject_HEAD
@@ -12,19 +17,18 @@ Splitter;
 static
 PyUnicodeObject *prepareString(PyUnicodeObject *o);
-static PyObject * checkSynword(Splitter *self,PyObject *word)
+static PyObject *checkSynword(Splitter *self, PyObject *word)
 {
+    /* Always returns a borrowed reference */
    PyObject *value;
-    PyObject *res;
    if (self->synstop) {
        value = PyDict_GetItem(self->synstop,word);
-        if (value) {
+        if (value != NULL) {
-            res = value;
+          return value;
-        } else res = word;
+        }
-    } res = word;
+    }
+    return word;
-    return res;
 }
 static void
@@ -60,16 +64,9 @@ Splitter_repeat(Splitter *self, long n)
 static PyObject *
 Splitter_item(Splitter *self, int i)
 {
-    PyObject *item=NULL;
+  PyObject *item;
+  item = PyList_GetItem(self->list, i);
-    if (i >= PyList_Size(self->list)) {
+  Py_XINCREF(item);  /* Promote borrowed ref unless exception */
-        PyErr_SetString(PyExc_IndexError,"Splitter index out of range");
-        return NULL;
-    }
-    item=PyList_GET_ITEM(self->list , i);
-    Py_INCREF(item);
  return item;
 }
@@ -77,19 +74,19 @@ Splitter_item(Splitter *self, int i)
 static PyObject *
 Splitter_indexes(Splitter *self, PyObject *args)
 {
-    int i=0;
+    int i=0, size;
    PyObject *word=NULL,*item=NULL,*r=NULL,*index=NULL;
    if (! (PyArg_ParseTuple(args,"O",&word))) return NULL;
    if (! (r=PyList_New(0))) return NULL;
-    for (i=0;i<PyList_Size(self->list);i++) {
+    size = PyList_Size(self->list);
+    for (i=0;i<size;i++) {
        item=PyList_GET_ITEM(self->list,i);
        if (PyUnicode_Compare(word,item)==0) {
            index=PyInt_FromLong(i);
            if(!index) return NULL;
-            Py_INCREF(item);
            PyList_Append(r,index);
        }
    }
@@ -129,7 +126,7 @@ static struct PyMethodDef Splitter_methods[] =
        },
        { "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
-          "indexes(word) -- Return al list of the indexes of word in the sequence",
+          "indexes(word) -- Return a list of the indexes of word in the sequence",
        },
        { NULL, NULL }		/* sentinel */
    };
@@ -181,16 +178,15 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
    int i=0;
    int start=0;
-    if (! (doc1 = prepareString(doc))) {
+    doc1 = prepareString(doc);
+    if (doc1 == NULL)
-        return 0;
+      return -1;
-    }
    s=doc1->str;
    self->list = PyList_New(0);
-    do {
+    for (i = 0; i < len; s++, i++) {
        register Py_UNICODE ch;
        ch = *s;
@@ -208,66 +204,38 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
            if (!(Py_UNICODE_ISALNUM(ch) || ch=='/' || ch=='_' || ch=='-')) {
                inside_word = 0;
-                word = PySequence_GetSlice((PyObject *)doc,start,i);
+                word = PySequence_GetSlice((PyObject *)doc1,start,
-                if (word==NULL) {
-                    Py_DECREF(doc1);
-                    return 0;
-                }
                                           // Stem word
-                if (PyUnicode_GET_SIZE(word)>MAX_WORD) {
+                                           min(i, start + MAX_WORD));
-                    PyObject *tmpword=word;
+                if (word==NULL)
-                    tmpword = PySequence_GetSlice(word,0,MAX_WORD);
+                  goto err;
-                    if (tmpword==NULL) {
-                        Py_DECREF(doc1);
-                        return 0;
-                    }
-                    Py_DECREF(word);
-                    word = tmpword;
-                }
                synword = checkSynword(self,word);
                if (synword != Py_None) {
                  PyList_Append(self->list,synword);
                }
-                Py_DECREF(word);
                start =  0;
 #ifdef DEBUG
                PyObject_Print(word,stdout,0);
                fflush(stdout);
 #endif
+                Py_DECREF(word);
            }
        }
-        s++;
-    } while(++i < len);
-    if (inside_word) {
-        word = PySequence_GetSlice((PyObject *)doc,start,i);
-        if (word==NULL) {
-            Py_DECREF(doc1);
-            return 0;
    }
+    if (inside_word) {
+        word = PySequence_GetSlice((PyObject *)doc1,start,
                                   // Stem word
-        if (PyUnicode_GET_SIZE(word)>MAX_WORD) {
+                                   min(len, start + MAX_WORD));
-            word = PySequence_GetSlice(word,0,MAX_WORD);
+        if (word==NULL)
-            if (word==NULL) {
+          goto err;
-                Py_DECREF(doc1);
-                return 0;
-            }
-        }
        synword = checkSynword(self,word);
        if (synword != Py_None) {
          PyList_Append(self->list,synword);
-        } else Py_DECREF(synword);
+        }
        Py_DECREF(word);
    }
@@ -279,6 +247,10 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
    Py_DECREF(doc1);
    return 1;
+ err:
+    Py_DECREF(doc1);
+    return -1;
 }
@@ -304,12 +276,9 @@ PyUnicodeObject *prepareString(PyUnicodeObject *o)
 {
    PyUnicodeObject *u;
-    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, o->length);
+    u = (PyUnicodeObject*) PyUnicode_FromUnicode(o->str, o->length);
-    if (u == NULL) return NULL;
+    if (u != NULL)
-    Py_UNICODE_COPY(u->str, o->str, o->length);
      fixlower(u);
    return  u;
 }
@@ -317,7 +286,7 @@ static char *splitter_args[]={"doc","synstop","encoding",NULL};
 static PyObject *
-get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
+newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
 {
    Splitter *self=NULL;
    PyObject *doc=NULL, *unicodedoc=NULL,*synstop=NULL;
@@ -349,17 +318,13 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
        return NULL;
    }
    if (synstop) {
        self->synstop = synstop;
        Py_INCREF(synstop);
    } else  self->synstop=NULL;
-    if (! (splitUnicodeString(self,(PyUnicodeObject *)unicodedoc))) {
+    if ((splitUnicodeString(self,(PyUnicodeObject *)unicodedoc)) < 0)
      goto err;
-    }
    Py_DECREF(unicodedoc);
    return (PyObject*)self;
@@ -373,8 +338,10 @@ err:
 static struct PyMethodDef Splitter_module_methods[] =
    {
-        { "UnicodeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
+        { "UnicodeSplitter", (PyCFunction)newSplitter,
-            "UnicodeSplitter(doc[,synstop][,encoding='latin1']) -- Return a word splitter"
+          METH_VARARGS|METH_KEYWORDS,
+          "UnicodeSplitter(doc[,synstop][,encoding='latin1']) "
+          "-- Return a word splitter"
        },
        { NULL, NULL }
    };
@@ -384,7 +351,7 @@ static char Splitter_module_documentation[] =
    "\n"
    "for use in an inverted index\n"
    "\n"
-    "$Id: UnicodeSplitter.c,v 1.7 2001/10/18 15:56:20 andreasjung Exp $\n"
+    "$Id: UnicodeSplitter.c,v 1.8 2001/10/19 20:08:05 shane Exp $\n"
    ;
@@ -392,7 +359,7 @@ void
 initUnicodeSplitter(void)
 {
    PyObject *m, *d;
-    char *rev="$Revision: 1.7 $";
+    char *rev="$Revision: 1.8 $";
    /* Create the module and add the functions */
    m = Py_InitModule4("UnicodeSplitter", Splitter_module_methods,