Commit 17f69863 authored by Andreas Jung

added 3 new parameters for all zope splitters

parent fc443b19
...@@ -27,6 +27,19 @@ Zope Changes ...@@ -27,6 +27,19 @@ Zope Changes
Features Added Features Added
- TextIndex/Splitters: the constructors of all three splitters
now have three new optional parameters:
'maxlen'=(1-128) - to specify the maximum length of
split words
'singlechar'=(1|0) - allows single characters to be indexed
'indexnumbers'=(1|0)- allows numbers to be indexed
The default values of all parameters reflect the standard
behaviour.
- Enhancements to utilites/requestprofiler.py: - Enhancements to utilites/requestprofiler.py:
Added readstats and writestats features which allow for saves and Added readstats and writestats features which allow for saves and
......
...@@ -32,6 +32,9 @@ typedef struct ...@@ -32,6 +32,9 @@ typedef struct
PyObject *text, *synstop; PyObject *text, *synstop;
char *here, *end; char *here, *end;
int index; int index;
int allow_single_chars;
int index_numbers;
int max_len;
} }
Splitter; Splitter;
...@@ -117,6 +120,32 @@ Splitter_length(Splitter *self) ...@@ -117,6 +120,32 @@ Splitter_length(Splitter *self)
return self->index+1; return self->index+1;
} }
/*
 * split() -- return a new list holding every word of the text.
 *
 * Rewinds the splitter with Splitter_reset() and then calls next_word()
 * repeatedly, appending each result to a fresh list until next_word()
 * hands back Py_None, which marks the end of the text.
 *
 * Returns a new reference to the list, or NULL with an exception set
 * if list creation, next_word() or the append fails.
 *
 * Reference handling: every value returned by next_word() is treated as
 * an owned reference (PyList_Append takes its own reference), so each one
 * is released here -- including the terminating Py_None.
 */
static PyObject *
Splitter_split(Splitter *self)
{
    PyObject *list = NULL, *word = NULL;

    UNLESS(list = PyList_New(0)) return NULL;

    Splitter_reset(self);

    while (1) {
        UNLESS(word = next_word(self, NULL, NULL)) goto err;

        if (word == Py_None) {
            /* End marker: drop our reference to None before returning. */
            Py_DECREF(word);
            return list;
        }

        /* A failed append must abort; the exception is already set. */
        if (PyList_Append(list, word) < 0) goto err;

        Py_DECREF(word);
        word = NULL;
    }

err:
    /* Do not leak the partially built list (or the pending word). */
    Py_XDECREF(word);
    Py_DECREF(list);
    return NULL;
}
static PyObject * static PyObject *
Splitter_concat(Splitter *self, PyObject *other) Splitter_concat(Splitter *self, PyObject *other)
{ {
...@@ -155,7 +184,7 @@ check_synstop(Splitter *self, PyObject *word) ...@@ -155,7 +184,7 @@ check_synstop(Splitter *self, PyObject *word)
len = PyString_Size(word); len = PyString_Size(word);
if(len < 2) /* Single-letter words are stop words! */ if(len < 2 && ! self->allow_single_chars) /* Single-letter words are stop words! */
{ {
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
...@@ -167,7 +196,7 @@ check_synstop(Splitter *self, PyObject *word) ...@@ -167,7 +196,7 @@ check_synstop(Splitter *self, PyObject *word)
for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); ) for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); )
; ;
if (len < 0) { if (len < 0 && ! self->index_numbers) {
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
} }
...@@ -197,12 +226,11 @@ check_synstop(Splitter *self, PyObject *word) ...@@ -197,12 +226,11 @@ check_synstop(Splitter *self, PyObject *word)
return value; /* Which must be None! */ return value; /* Which must be None! */
} }
#define MAX_WORD 64 /* Words longer than MAX_WORD are stemmed */
static PyObject * static PyObject *
next_word(Splitter *self, char **startpos, char **endpos) next_word(Splitter *self, char **startpos, char **endpos)
{ {
char wbuf[MAX_WORD]; char wbuf[256];
char *end, *here, *b; char *end, *here, *b;
int i = 0, c; int i = 0, c;
PyObject *pyword, *res; PyObject *pyword, *res;
...@@ -232,13 +260,13 @@ next_word(Splitter *self, char **startpos, char **endpos) ...@@ -232,13 +260,13 @@ next_word(Splitter *self, char **startpos, char **endpos)
if(startpos && i==0) if(startpos && i==0)
*startpos=here; *startpos=here;
if(i++ < MAX_WORD) if(i++ < self->max_len)
*b++ = c; *b++ = c;
} else if (i != 0) { /* We've found the end of a word */ } else if (i != 0) { /* We've found the end of a word */
if(i >= MAX_WORD) if(i >= self->max_len)
i=MAX_WORD; /* "stem" the long word */ i=self->max_len; /* "stem" the long word */
UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) { UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
self->here=here; self->here=here;
...@@ -282,8 +310,8 @@ next_word(Splitter *self, char **startpos, char **endpos) ...@@ -282,8 +310,8 @@ next_word(Splitter *self, char **startpos, char **endpos)
/* We've reached the end of the string */ /* We've reached the end of the string */
if(i >= MAX_WORD) if(i >= self->max_len)
i=MAX_WORD; /* "stem" the long word */ i=self->max_len; /* "stem" the long word */
if (i == 0) { if (i == 0) {
/* No words */ /* No words */
...@@ -416,6 +444,9 @@ err: ...@@ -416,6 +444,9 @@ err:
static struct PyMethodDef Splitter_methods[] = static struct PyMethodDef Splitter_methods[] =
{ {
{ "split", (PyCFunction)Splitter_split, 0,
"split() -- Split the string in one run"
},
{ "pos", (PyCFunction)Splitter_pos, 0, { "pos", (PyCFunction)Splitter_pos, 0,
"pos(index) -- Return the starting and ending position of a token" "pos(index) -- Return the starting and ending position of a token"
}, },
...@@ -459,7 +490,7 @@ static PyTypeObject SplitterType = { ...@@ -459,7 +490,7 @@ static PyTypeObject SplitterType = {
SplitterType__doc__ /* Documentation string */ SplitterType__doc__ /* Documentation string */
}; };
static char *splitter_args[]={"doc","synstop","encoding",NULL}; static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen",NULL};
static PyObject * static PyObject *
get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds) get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
...@@ -467,8 +498,29 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -467,8 +498,29 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
Splitter *self; Splitter *self;
PyObject *doc, *synstop = NULL; PyObject *doc, *synstop = NULL;
char * encoding="latin1"; char * encoding="latin1";
int single_char = 0;
int index_numbers = 0;
int max_len=64;
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&single_char,&index_numbers,&max_len)) return NULL;
if (index_numbers<0 || index_numbers>1) {
PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
return NULL;
}
if (single_char<0 || single_char>1) {
PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
return NULL;
}
if (max_len<1 || max_len>128) {
PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
return NULL;
}
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args,&doc,&synstop,&encoding)) return NULL;
UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL; UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
...@@ -484,6 +536,9 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -484,6 +536,9 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
UNLESS(self->here=PyString_AsString(self->text)) goto err; UNLESS(self->here=PyString_AsString(self->text)) goto err;
self->end = self->here + PyString_Size(self->text); self->end = self->here + PyString_Size(self->text);
self->allow_single_chars = single_char;
self->index_numbers = index_numbers;
self->max_len = max_len;
self->index = -1; self->index = -1;
...@@ -498,7 +553,7 @@ err: ...@@ -498,7 +553,7 @@ err:
static struct PyMethodDef Splitter_module_methods[] = static struct PyMethodDef Splitter_module_methods[] =
{ {
{ "ISO_8859_1_Splitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS, { "ISO_8859_1_Splitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
"ISO_8859_1_Splitter(doc[,synstop]) -- Return a word splitter" "ISO_8859_1_Splitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen]) -- Return a word splitter"
}, },
{ NULL, NULL } { NULL, NULL }
...@@ -509,7 +564,7 @@ static char Splitter_module_documentation[] = ...@@ -509,7 +564,7 @@ static char Splitter_module_documentation[] =
"\n" "\n"
"for use in an inverted index\n" "for use in an inverted index\n"
"\n" "\n"
"$Id: ISO_8859_1_Splitter.c,v 1.5 2001/11/28 15:51:04 matt Exp $\n" "$Id: ISO_8859_1_Splitter.c,v 1.6 2002/01/09 15:17:34 andreasjung Exp $\n"
; ;
...@@ -518,7 +573,7 @@ void ...@@ -518,7 +573,7 @@ void
initISO_8859_1_Splitter(void) initISO_8859_1_Splitter(void)
{ {
PyObject *m, *d; PyObject *m, *d;
char *rev="$Revision: 1.5 $"; char *rev="$Revision: 1.6 $";
/* Create the module and add the functions */ /* Create the module and add the functions */
initSplitterTrtabs(); initSplitterTrtabs();
......
...@@ -13,8 +13,6 @@ ...@@ -13,8 +13,6 @@
#include "Python.h" #include "Python.h"
#define MAX_WORD 64 /* Words longer than MAX_WORD are stemmed */
#ifndef min #ifndef min
#define min(a,b) ((a)<(b)?(a):(b)) #define min(a,b) ((a)<(b)?(a):(b))
#endif #endif
...@@ -24,8 +22,12 @@ typedef struct ...@@ -24,8 +22,12 @@ typedef struct
PyObject_HEAD PyObject_HEAD
PyObject *list; PyObject *list;
PyObject *synstop; PyObject *synstop;
int max_len;
int allow_single_chars;
int index_numbers;
} }
Splitter; Splitter;
static static
PyUnicodeObject *prepareString(PyUnicodeObject *o); PyUnicodeObject *prepareString(PyUnicodeObject *o);
...@@ -34,6 +36,9 @@ static PyObject *checkSynword(Splitter *self, PyObject *word) ...@@ -34,6 +36,9 @@ static PyObject *checkSynword(Splitter *self, PyObject *word)
/* Always returns a borrowed reference */ /* Always returns a borrowed reference */
PyObject *value; PyObject *value;
if (PyUnicode_GetSize(word)==1 && ! self->allow_single_chars)
return Py_None;
if (self->synstop) { if (self->synstop) {
value = PyDict_GetItem(self->synstop,word); value = PyDict_GetItem(self->synstop,word);
if (value != NULL) { if (value != NULL) {
...@@ -82,6 +87,14 @@ Splitter_item(Splitter *self, int i) ...@@ -82,6 +87,14 @@ Splitter_item(Splitter *self, int i)
return item; return item;
} }
/*
 * split() -- hand back the list stored on the splitter.
 *
 * Returns a new reference to self->list itself (the same object,
 * not a copy).
 */
static PyObject *
Splitter_split(Splitter *self)
{
    PyObject *words = self->list;

    Py_INCREF(words);
    return words;
}
static PyObject * static PyObject *
Splitter_indexes(Splitter *self, PyObject *args) Splitter_indexes(Splitter *self, PyObject *args)
...@@ -133,6 +146,8 @@ Splitter_pos(Splitter *self, PyObject *args) ...@@ -133,6 +146,8 @@ Splitter_pos(Splitter *self, PyObject *args)
static struct PyMethodDef Splitter_methods[] = static struct PyMethodDef Splitter_methods[] =
{ {
{ "split", (PyCFunction) Splitter_split, 0,
"split() -- Split string in one run" },
{ "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS, { "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
"indexes(word) -- Return a list of the indexes of word in the sequence", "indexes(word) -- Return a list of the indexes of word in the sequence",
}, },
...@@ -198,14 +213,19 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc) ...@@ -198,14 +213,19 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
register Py_UNICODE ch; register Py_UNICODE ch;
ch = *s; ch = *s;
#ifdef DEBUG
printf("%d %c %d\n",i,ch,ch);
fflush(stdout);
#endif
if (!inside_word) { if (!inside_word) {
if (Py_UNICODE_ISALPHA(ch)) { if (self->index_numbers) {
inside_word=1; if (Py_UNICODE_ISALNUM(ch)) {
start = i; inside_word=1;
start = i;
}
} else {
if (Py_UNICODE_ISALPHA(ch)) {
inside_word=1;
start = i;
}
} }
} else { } else {
...@@ -213,7 +233,7 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc) ...@@ -213,7 +233,7 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
inside_word = 0; inside_word = 0;
word = PySequence_GetSlice((PyObject *)doc1,start, word = PySequence_GetSlice((PyObject *)doc1,start,
min(i, start + MAX_WORD)); min(i, start + self->max_len));
if (word==NULL) if (word==NULL)
goto err; goto err;
...@@ -234,7 +254,7 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc) ...@@ -234,7 +254,7 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
if (inside_word) { if (inside_word) {
word = PySequence_GetSlice((PyObject *)doc1,start, word = PySequence_GetSlice((PyObject *)doc1,start,
min(len, start + MAX_WORD)); min(len, start + self->max_len));
if (word==NULL) if (word==NULL)
goto err; goto err;
...@@ -288,7 +308,7 @@ PyUnicodeObject *prepareString(PyUnicodeObject *o) ...@@ -288,7 +308,7 @@ PyUnicodeObject *prepareString(PyUnicodeObject *o)
return u; return u;
} }
static char *splitter_args[]={"doc","synstop","encoding",NULL}; static char *splitter_args[]={"doc","synstop","encoding","indexnumbers","singlechar","maxlen",NULL};
static PyObject * static PyObject *
...@@ -297,9 +317,11 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -297,9 +317,11 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
Splitter *self=NULL; Splitter *self=NULL;
PyObject *doc=NULL, *unicodedoc=NULL,*synstop=NULL; PyObject *doc=NULL, *unicodedoc=NULL,*synstop=NULL;
char *encoding = "latin1"; char *encoding = "latin1";
int index_numbers = 0;
int max_len=64;
int single_char = 0;
if (! (self = PyObject_NEW(Splitter, &SplitterType))) return NULL; if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&index_numbers,&single_char,&max_len))) return NULL;
if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args,&doc,&synstop,&encoding))) return NULL;
#ifdef DEBUG #ifdef DEBUG
puts("got text"); puts("got text");
...@@ -307,6 +329,21 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -307,6 +329,21 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
fflush(stdout); fflush(stdout);
#endif #endif
if (index_numbers<0 || index_numbers>1) {
PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
return NULL;
}
if (single_char<0 || single_char>1) {
PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
return NULL;
}
if (max_len<1 || max_len>128) {
PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
return NULL;
}
if (PyString_Check(doc)) { if (PyString_Check(doc)) {
unicodedoc = PyUnicode_FromEncodedObject(doc,encoding,"strict"); unicodedoc = PyUnicode_FromEncodedObject(doc,encoding,"strict");
...@@ -324,11 +361,17 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -324,11 +361,17 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
return NULL; return NULL;
} }
if (! (self = PyObject_NEW(Splitter, &SplitterType))) return NULL;
if (synstop) { if (synstop) {
self->synstop = synstop; self->synstop = synstop;
Py_INCREF(synstop); Py_INCREF(synstop);
} else self->synstop=NULL; } else self->synstop=NULL;
self->index_numbers = index_numbers;
self->max_len = max_len;
self->allow_single_chars = single_char;
if ((splitUnicodeString(self,(PyUnicodeObject *)unicodedoc)) < 0) if ((splitUnicodeString(self,(PyUnicodeObject *)unicodedoc)) < 0)
goto err; goto err;
...@@ -344,11 +387,6 @@ err: ...@@ -344,11 +387,6 @@ err:
static struct PyMethodDef Splitter_module_methods[] = static struct PyMethodDef Splitter_module_methods[] =
{ {
{ "pos", (PyCFunction) Splitter_pos, 0,
"pos(index) -- Return the starting and ending position of a token" },
{ "indexes", (PyCFunction) Splitter_indexes, METH_VARARGS,
"indexes(word) -- Return a list of the indexes of word in sequence" },
{ "UnicodeSplitter", (PyCFunction)newSplitter, { "UnicodeSplitter", (PyCFunction)newSplitter,
METH_VARARGS|METH_KEYWORDS, METH_VARARGS|METH_KEYWORDS,
"UnicodeSplitter(doc[,synstop][,encoding='latin1']) " "UnicodeSplitter(doc[,synstop][,encoding='latin1']) "
...@@ -362,7 +400,7 @@ static char Splitter_module_documentation[] = ...@@ -362,7 +400,7 @@ static char Splitter_module_documentation[] =
"\n" "\n"
"for use in an inverted index\n" "for use in an inverted index\n"
"\n" "\n"
"$Id: UnicodeSplitter.c,v 1.12 2001/11/28 15:51:04 matt Exp $\n" "$Id: UnicodeSplitter.c,v 1.13 2002/01/09 15:17:34 andreasjung Exp $\n"
; ;
...@@ -370,7 +408,7 @@ void ...@@ -370,7 +408,7 @@ void
initUnicodeSplitter(void) initUnicodeSplitter(void)
{ {
PyObject *m, *d; PyObject *m, *d;
char *rev="$Revision: 1.12 $"; char *rev="$Revision: 1.13 $";
/* Create the module and add the functions */ /* Create the module and add the functions */
m = Py_InitModule4("UnicodeSplitter", Splitter_module_methods, m = Py_InitModule4("UnicodeSplitter", Splitter_module_methods,
......
/***************************************************************************** /*****************************************************************************
Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved. Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
This software is subject to the provisions of the Zope Public License, This software is subject to the provisions of the Zope Public License,
...@@ -10,6 +10,8 @@ ...@@ -10,6 +10,8 @@
FOR A PARTICULAR PURPOSE FOR A PARTICULAR PURPOSE
****************************************************************************/ ****************************************************************************/
#include "Python.h" #include "Python.h"
#include <ctype.h> #include <ctype.h>
...@@ -23,6 +25,9 @@ typedef struct ...@@ -23,6 +25,9 @@ typedef struct
PyObject *text, *synstop; PyObject *text, *synstop;
char *here, *end; char *here, *end;
int index; int index;
int allow_single_chars;
int index_numbers;
int max_len;
} }
Splitter; Splitter;
...@@ -98,7 +103,7 @@ check_synstop(Splitter *self, PyObject *word) ...@@ -98,7 +103,7 @@ check_synstop(Splitter *self, PyObject *word)
cword = PyString_AsString(word); cword = PyString_AsString(word);
len = PyString_Size(word); len = PyString_Size(word);
if(len < 2) /* Single-letter words are stop words! */ if(len < 2 && ! self->allow_single_chars) /* Single-letter words are stop words! */
{ {
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
...@@ -110,7 +115,7 @@ check_synstop(Splitter *self, PyObject *word) ...@@ -110,7 +115,7 @@ check_synstop(Splitter *self, PyObject *word)
for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); ) for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); )
; ;
if (len < 0) { if (len < 0 && ! self->index_numbers) {
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
} }
...@@ -140,12 +145,11 @@ check_synstop(Splitter *self, PyObject *word) ...@@ -140,12 +145,11 @@ check_synstop(Splitter *self, PyObject *word)
return value; /* Which must be None! */ return value; /* Which must be None! */
} }
#define MAX_WORD 64 /* Words longer than MAX_WORD are stemmed */
static PyObject * static PyObject *
next_word(Splitter *self, char **startpos, char **endpos) next_word(Splitter *self, char **startpos, char **endpos)
{ {
char wbuf[MAX_WORD]; char wbuf[256];
char *end, *here, *b; char *end, *here, *b;
int i = 0, c; int i = 0, c;
PyObject *pyword, *res; PyObject *pyword, *res;
...@@ -175,13 +179,13 @@ next_word(Splitter *self, char **startpos, char **endpos) ...@@ -175,13 +179,13 @@ next_word(Splitter *self, char **startpos, char **endpos)
if(startpos && i==0) if(startpos && i==0)
*startpos=here; *startpos=here;
if(i++ < MAX_WORD) if(i++ < self->max_len)
*b++ = c; *b++ = c;
} else if (i != 0) { /* We've found the end of a word */ } else if (i != 0) { /* We've found the end of a word */
if(i >= MAX_WORD) if(i >= self->max_len)
i=MAX_WORD; /* "stem" the long word */ i=self->max_len; /* "stem" the long word */
UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) { UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
self->here=here; self->here=here;
...@@ -225,8 +229,8 @@ next_word(Splitter *self, char **startpos, char **endpos) ...@@ -225,8 +229,8 @@ next_word(Splitter *self, char **startpos, char **endpos)
/* We've reached the end of the string */ /* We've reached the end of the string */
if(i >= MAX_WORD) if(i >= self->max_len)
i=MAX_WORD; /* "stem" the long word */ i=self->max_len; /* "stem" the long word */
if (i == 0) { if (i == 0) {
/* No words */ /* No words */
...@@ -274,6 +278,31 @@ Splitter_item(Splitter *self, int i) ...@@ -274,6 +278,31 @@ Splitter_item(Splitter *self, int i)
return word; return word;
} }
/*
 * split() -- return a new list holding every word of the text.
 *
 * Rewinds the splitter with Splitter_reset() and then calls next_word()
 * repeatedly, appending each result to a fresh list until next_word()
 * hands back Py_None, which marks the end of the text.
 *
 * Returns a new reference to the list, or NULL with an exception set
 * if list creation, next_word() or the append fails.
 *
 * Reference handling: every value returned by next_word() is treated as
 * an owned reference (PyList_Append takes its own reference), so each one
 * is released here -- including the terminating Py_None.
 */
static PyObject *
Splitter_split(Splitter *self)
{
    PyObject *list = NULL, *word = NULL;

    UNLESS(list = PyList_New(0)) return NULL;

    Splitter_reset(self);

    while (1) {
        UNLESS(word = next_word(self, NULL, NULL)) goto err;

        if (word == Py_None) {
            /* End marker: drop our reference to None before returning. */
            Py_DECREF(word);
            return list;
        }

        /* A failed append must abort; the exception is already set. */
        if (PyList_Append(list, word) < 0) goto err;

        Py_DECREF(word);
        word = NULL;
    }

err:
    /* Do not leak the partially built list (or the pending word). */
    Py_XDECREF(word);
    Py_DECREF(list);
    return NULL;
}
static PyObject * static PyObject *
Splitter_slice(Splitter *self, int i, int j) Splitter_slice(Splitter *self, int i, int j)
{ {
...@@ -282,14 +311,14 @@ Splitter_slice(Splitter *self, int i, int j) ...@@ -282,14 +311,14 @@ Splitter_slice(Splitter *self, int i, int j)
} }
static PySequenceMethods Splitter_as_sequence = { static PySequenceMethods Splitter_as_sequence = {
(inquiry)Splitter_length, /*sq_length*/ (inquiry)Splitter_length, /*sq_length*/
(binaryfunc)Splitter_concat, /*sq_concat*/ (binaryfunc)Splitter_concat, /*sq_concat*/
(intargfunc)Splitter_repeat, /*sq_repeat*/ (intargfunc)Splitter_repeat, /*sq_repeat*/
(intargfunc)Splitter_item, /*sq_item*/ (intargfunc)Splitter_item, /*sq_item*/
(intintargfunc)Splitter_slice, /*sq_slice*/ (intintargfunc)Splitter_slice, /*sq_slice*/
(intobjargproc)0, /*sq_ass_item*/ (intobjargproc)0, /*sq_ass_item*/
(intintobjargproc)0, /*sq_ass_slice*/ (intintobjargproc)0, /*sq_ass_slice*/
}; };
static PyObject * static PyObject *
Splitter_pos(Splitter *self, PyObject *args) Splitter_pos(Splitter *self, PyObject *args)
...@@ -359,8 +388,12 @@ err: ...@@ -359,8 +388,12 @@ err:
static struct PyMethodDef Splitter_methods[] = static struct PyMethodDef Splitter_methods[] =
{ {
{ "split", (PyCFunction)Splitter_split, 0,
"split() -- Split complete string in one run"
},
{ "pos", (PyCFunction)Splitter_pos, 0, { "pos", (PyCFunction)Splitter_pos, 0,
"pos(index) -- Return the starting and ending position of a token" "pos(index) -- Return the starting and ending position of a token"
}, },
{ "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS, { "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
...@@ -378,31 +411,31 @@ Splitter_getattr(Splitter *self, char *name) ...@@ -378,31 +411,31 @@ Splitter_getattr(Splitter *self, char *name)
static char SplitterType__doc__[] = ""; static char SplitterType__doc__[] = "";
static PyTypeObject SplitterType = { static PyTypeObject SplitterType = {
PyObject_HEAD_INIT(NULL) PyObject_HEAD_INIT(NULL)
0, /*ob_size*/ 0, /*ob_size*/
"Splitter", /*tp_name*/ "Splitter", /*tp_name*/
sizeof(Splitter), /*tp_basicsize*/ sizeof(Splitter), /*tp_basicsize*/
0, /*tp_itemsize*/ 0, /*tp_itemsize*/
/* methods */ /* methods */
(destructor)Splitter_dealloc, /*tp_dealloc*/ (destructor)Splitter_dealloc, /*tp_dealloc*/
(printfunc)0, /*tp_print*/ (printfunc)0, /*tp_print*/
(getattrfunc)Splitter_getattr, /*tp_getattr*/ (getattrfunc)Splitter_getattr, /*tp_getattr*/
(setattrfunc)0, /*tp_setattr*/ (setattrfunc)0, /*tp_setattr*/
(cmpfunc)0, /*tp_compare*/ (cmpfunc)0, /*tp_compare*/
(reprfunc)0, /*tp_repr*/ (reprfunc)0, /*tp_repr*/
0, /*tp_as_number*/ 0, /*tp_as_number*/
&Splitter_as_sequence, /*tp_as_sequence*/ &Splitter_as_sequence, /*tp_as_sequence*/
0, /*tp_as_mapping*/ 0, /*tp_as_mapping*/
(hashfunc)0, /*tp_hash*/ (hashfunc)0, /*tp_hash*/
(ternaryfunc)0, /*tp_call*/ (ternaryfunc)0, /*tp_call*/
(reprfunc)0, /*tp_str*/ (reprfunc)0, /*tp_str*/
/* Space for future expansion */ /* Space for future expansion */
0L,0L,0L,0L, 0L,0L,0L,0L,
SplitterType__doc__ /* Documentation string */ SplitterType__doc__ /* Documentation string */
}; };
static char *splitter_args[]={"doc","synstop","encoding",NULL}; static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen",NULL};
static PyObject * static PyObject *
...@@ -411,8 +444,28 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds) ...@@ -411,8 +444,28 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
Splitter *self; Splitter *self;
PyObject *doc, *synstop = NULL; PyObject *doc, *synstop = NULL;
char *encoding = "latin1"; char *encoding = "latin1";
int single_char = 0;
int index_numbers = 0;
int max_len= 64;
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args, \
&doc,&synstop,&encoding,&single_char,&index_numbers,&max_len)) return NULL;
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args, &doc,&synstop,&encoding)) return NULL; if (index_numbers<0 || index_numbers>1) {
PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
return NULL;
}
if (single_char<0 || single_char>1) {
PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
return NULL;
}
if (max_len<1 || max_len>128) {
PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
return NULL;
}
UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL; UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
...@@ -430,6 +483,9 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds) ...@@ -430,6 +483,9 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
self->end = self->here + PyString_Size(self->text); self->end = self->here + PyString_Size(self->text);
self->index = -1; self->index = -1;
self->allow_single_chars = single_char;
self->index_numbers = index_numbers;
self->max_len = max_len;
return (PyObject*)self; return (PyObject*)self;
...@@ -442,7 +498,7 @@ err: ...@@ -442,7 +498,7 @@ err:
static struct PyMethodDef Splitter_module_methods[] = static struct PyMethodDef Splitter_module_methods[] =
{ {
{ "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS, { "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
"ZopeSplitter(doc[,synstop]) -- Return a word splitter" "ZopeSplitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen]) -- Return a word splitter"
}, },
{ NULL, NULL } { NULL, NULL }
...@@ -453,7 +509,7 @@ static char Splitter_module_documentation[] = ...@@ -453,7 +509,7 @@ static char Splitter_module_documentation[] =
"\n" "\n"
"for use in an inverted index\n" "for use in an inverted index\n"
"\n" "\n"
"$Id: ZopeSplitter.c,v 1.5 2001/11/28 15:51:04 matt Exp $\n" "$Id: ZopeSplitter.c,v 1.6 2002/01/09 15:17:34 andreasjung Exp $\n"
; ;
...@@ -461,7 +517,7 @@ void ...@@ -461,7 +517,7 @@ void
initZopeSplitter(void) initZopeSplitter(void)
{ {
PyObject *m, *d; PyObject *m, *d;
char *rev="$Revision: 1.5 $"; char *rev="$Revision: 1.6 $";
/* Create the module and add the functions */ /* Create the module and add the functions */
m = Py_InitModule4("ZopeSplitter", Splitter_module_methods, m = Py_InitModule4("ZopeSplitter", Splitter_module_methods,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment