Commit 17f69863 authored by Andreas Jung's avatar Andreas Jung

added 3 new parameters for all zope splitters

parent fc443b19
......@@ -27,6 +27,19 @@ Zope Changes
Features Added
- TextIndex/Splitters: the constructors of all three splitters
now accept three new optional parameters:
'maxlen'=(1-128) - specifies the maximum length of
split words (longer words are truncated)
'singlechar'=(1|0) - allows single characters to be indexed
'indexnumbers'=(1|0)- allows numbers to be indexed
The default values of all parameters reflect the standard
behaviour.
- Enhancements to utilities/requestprofiler.py:
Added readstats and writestats features which allow for saves and
......
......@@ -32,6 +32,9 @@ typedef struct
PyObject *text, *synstop;
char *here, *end;
int index;
int allow_single_chars;
int index_numbers;
int max_len;
}
Splitter;
......@@ -117,6 +120,32 @@ Splitter_length(Splitter *self)
return self->index+1;
}
/*
 * split() -- tokenize the complete text in one run.
 *
 * Resets the splitter, then calls next_word() repeatedly and appends every
 * word to a freshly created list until next_word() hands back Py_None.
 *
 * Returns a new reference to the list, or NULL (with the list released)
 * when list creation, next_word() or the append fails.
 *
 * NOTE(review): next_word() is treated as returning a new reference, as the
 * original loop already released each word it received -- confirm.
 */
static PyObject *
Splitter_split(Splitter*self)
{
  PyObject *list=NULL,*word=NULL;

  UNLESS(list = PyList_New(0)) return NULL;

  Splitter_reset(self);

  while (1) {
    UNLESS(word = next_word(self,NULL,NULL)) goto err;

    if (word == Py_None) {
      /* End marker: release our reference to None before returning. */
      Py_XDECREF(word);
      return list;
    }

    /* PyList_Append takes its own reference; a failure must not be dropped. */
    if (PyList_Append(list,word) < 0) goto err;

    Py_XDECREF(word);
    word = NULL;
  }

err:
  /* Do not leak the partially built list or the pending word. */
  Py_XDECREF(word);
  Py_XDECREF(list);
  return NULL;
}
static PyObject *
Splitter_concat(Splitter *self, PyObject *other)
{
......@@ -155,7 +184,7 @@ check_synstop(Splitter *self, PyObject *word)
len = PyString_Size(word);
if(len < 2) /* Single-letter words are stop words! */
if(len < 2 && ! self->allow_single_chars) /* Single-letter words are stop words! */
{
Py_INCREF(Py_None);
return Py_None;
......@@ -167,7 +196,7 @@ check_synstop(Splitter *self, PyObject *word)
for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); )
;
if (len < 0) {
if (len < 0 && ! self->index_numbers) {
Py_INCREF(Py_None);
return Py_None;
}
......@@ -197,12 +226,11 @@ check_synstop(Splitter *self, PyObject *word)
return value; /* Which must be None! */
}
#define MAX_WORD 64 /* Words longer than MAX_WORD are stemmed */
static PyObject *
next_word(Splitter *self, char **startpos, char **endpos)
{
char wbuf[MAX_WORD];
char wbuf[256];
char *end, *here, *b;
int i = 0, c;
PyObject *pyword, *res;
......@@ -232,13 +260,13 @@ next_word(Splitter *self, char **startpos, char **endpos)
if(startpos && i==0)
*startpos=here;
if(i++ < MAX_WORD)
if(i++ < self->max_len)
*b++ = c;
} else if (i != 0) { /* We've found the end of a word */
if(i >= MAX_WORD)
i=MAX_WORD; /* "stem" the long word */
if(i >= self->max_len)
i=self->max_len; /* "stem" the long word */
UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
self->here=here;
......@@ -282,8 +310,8 @@ next_word(Splitter *self, char **startpos, char **endpos)
/* We've reached the end of the string */
if(i >= MAX_WORD)
i=MAX_WORD; /* "stem" the long word */
if(i >= self->max_len)
i=self->max_len; /* "stem" the long word */
if (i == 0) {
/* No words */
......@@ -416,6 +444,9 @@ err:
static struct PyMethodDef Splitter_methods[] =
{
{ "split", (PyCFunction)Splitter_split, 0,
"split() -- Split the string in one run"
},
{ "pos", (PyCFunction)Splitter_pos, 0,
"pos(index) -- Return the starting and ending position of a token"
},
......@@ -459,7 +490,7 @@ static PyTypeObject SplitterType = {
SplitterType__doc__ /* Documentation string */
};
static char *splitter_args[]={"doc","synstop","encoding",NULL};
static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen",NULL};
static PyObject *
get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
......@@ -467,8 +498,29 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
Splitter *self;
PyObject *doc, *synstop = NULL;
char * encoding="latin1";
int single_char = 0;
int index_numbers = 0;
int max_len=64;
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&single_char,&index_numbers,&max_len)) return NULL;
if (index_numbers<0 || index_numbers>1) {
PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
return NULL;
}
if (single_char<0 || single_char>1) {
PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
return NULL;
}
if (max_len<1 || max_len>128) {
PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
return NULL;
}
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args,&doc,&synstop,&encoding)) return NULL;
UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
......@@ -484,6 +536,9 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
UNLESS(self->here=PyString_AsString(self->text)) goto err;
self->end = self->here + PyString_Size(self->text);
self->allow_single_chars = single_char;
self->index_numbers = index_numbers;
self->max_len = max_len;
self->index = -1;
......@@ -498,7 +553,7 @@ err:
static struct PyMethodDef Splitter_module_methods[] =
{
{ "ISO_8859_1_Splitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
"ISO_8859_1_Splitter(doc[,synstop]) -- Return a word splitter"
"ISO_8859_1_Splitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen]) -- Return a word splitter"
},
{ NULL, NULL }
......@@ -509,7 +564,7 @@ static char Splitter_module_documentation[] =
"\n"
"for use in an inverted index\n"
"\n"
"$Id: ISO_8859_1_Splitter.c,v 1.5 2001/11/28 15:51:04 matt Exp $\n"
"$Id: ISO_8859_1_Splitter.c,v 1.6 2002/01/09 15:17:34 andreasjung Exp $\n"
;
......@@ -518,7 +573,7 @@ void
initISO_8859_1_Splitter(void)
{
PyObject *m, *d;
char *rev="$Revision: 1.5 $";
char *rev="$Revision: 1.6 $";
/* Create the module and add the functions */
initSplitterTrtabs();
......
......@@ -13,8 +13,6 @@
#include "Python.h"
#define MAX_WORD 64 /* Words longer than MAX_WORD are stemmed */
#ifndef min
#define min(a,b) ((a)<(b)?(a):(b))
#endif
......@@ -24,8 +22,12 @@ typedef struct
PyObject_HEAD
PyObject *list;
PyObject *synstop;
int max_len;
int allow_single_chars;
int index_numbers;
}
Splitter;
static
PyUnicodeObject *prepareString(PyUnicodeObject *o);
......@@ -34,6 +36,9 @@ static PyObject *checkSynword(Splitter *self, PyObject *word)
/* Always returns a borrowed reference */
PyObject *value;
if (PyUnicode_GetSize(word)==1 && ! self->allow_single_chars)
return Py_None;
if (self->synstop) {
value = PyDict_GetItem(self->synstop,word);
if (value != NULL) {
......@@ -82,6 +87,14 @@ Splitter_item(Splitter *self, int i)
return item;
}
/*
 * split() -- hand out the splitter's word list.
 *
 * No copy is made: the caller receives a new reference to the very same
 * list object that is stored in self->list.
 */
static PyObject *
Splitter_split(Splitter *self)
{
    PyObject *words = self->list;

    Py_INCREF(words);
    return words;
}
static PyObject *
Splitter_indexes(Splitter *self, PyObject *args)
......@@ -133,6 +146,8 @@ Splitter_pos(Splitter *self, PyObject *args)
static struct PyMethodDef Splitter_methods[] =
{
{ "split", (PyCFunction) Splitter_split, 0,
"split() -- Split string in one run" },
{ "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
"indexes(word) -- Return a list of the indexes of word in the sequence",
},
......@@ -198,14 +213,19 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
register Py_UNICODE ch;
ch = *s;
#ifdef DEBUG
printf("%d %c %d\n",i,ch,ch);
fflush(stdout);
#endif
if (!inside_word) {
if (Py_UNICODE_ISALPHA(ch)) {
inside_word=1;
start = i;
if (self->index_numbers) {
if (Py_UNICODE_ISALNUM(ch)) {
inside_word=1;
start = i;
}
} else {
if (Py_UNICODE_ISALPHA(ch)) {
inside_word=1;
start = i;
}
}
} else {
......@@ -213,7 +233,7 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
inside_word = 0;
word = PySequence_GetSlice((PyObject *)doc1,start,
min(i, start + MAX_WORD));
min(i, start + self->max_len));
if (word==NULL)
goto err;
......@@ -234,7 +254,7 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
if (inside_word) {
word = PySequence_GetSlice((PyObject *)doc1,start,
min(len, start + MAX_WORD));
min(len, start + self->max_len));
if (word==NULL)
goto err;
......@@ -288,7 +308,7 @@ PyUnicodeObject *prepareString(PyUnicodeObject *o)
return u;
}
static char *splitter_args[]={"doc","synstop","encoding",NULL};
static char *splitter_args[]={"doc","synstop","encoding","indexnumbers","singlechar","maxlen",NULL};
static PyObject *
......@@ -297,9 +317,11 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
Splitter *self=NULL;
PyObject *doc=NULL, *unicodedoc=NULL,*synstop=NULL;
char *encoding = "latin1";
int index_numbers = 0;
int max_len=64;
int single_char = 0;
if (! (self = PyObject_NEW(Splitter, &SplitterType))) return NULL;
if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args,&doc,&synstop,&encoding))) return NULL;
if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&index_numbers,&single_char,&max_len))) return NULL;
#ifdef DEBUG
puts("got text");
......@@ -307,6 +329,21 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
fflush(stdout);
#endif
if (index_numbers<0 || index_numbers>1) {
PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
return NULL;
}
if (single_char<0 || single_char>1) {
PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
return NULL;
}
if (max_len<1 || max_len>128) {
PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
return NULL;
}
if (PyString_Check(doc)) {
unicodedoc = PyUnicode_FromEncodedObject(doc,encoding,"strict");
......@@ -324,11 +361,17 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
return NULL;
}
if (! (self = PyObject_NEW(Splitter, &SplitterType))) return NULL;
if (synstop) {
self->synstop = synstop;
Py_INCREF(synstop);
} else self->synstop=NULL;
self->index_numbers = index_numbers;
self->max_len = max_len;
self->allow_single_chars = single_char;
if ((splitUnicodeString(self,(PyUnicodeObject *)unicodedoc)) < 0)
goto err;
......@@ -344,11 +387,6 @@ err:
static struct PyMethodDef Splitter_module_methods[] =
{
{ "pos", (PyCFunction) Splitter_pos, 0,
"pos(index) -- Return the starting and ending position of a token" },
{ "indexes", (PyCFunction) Splitter_indexes, METH_VARARGS,
"indexes(word) -- Return a list of the indexes of word in sequence" },
{ "UnicodeSplitter", (PyCFunction)newSplitter,
METH_VARARGS|METH_KEYWORDS,
"UnicodeSplitter(doc[,synstop][,encoding='latin1']) "
......@@ -362,7 +400,7 @@ static char Splitter_module_documentation[] =
"\n"
"for use in an inverted index\n"
"\n"
"$Id: UnicodeSplitter.c,v 1.12 2001/11/28 15:51:04 matt Exp $\n"
"$Id: UnicodeSplitter.c,v 1.13 2002/01/09 15:17:34 andreasjung Exp $\n"
;
......@@ -370,7 +408,7 @@ void
initUnicodeSplitter(void)
{
PyObject *m, *d;
char *rev="$Revision: 1.12 $";
char *rev="$Revision: 1.13 $";
/* Create the module and add the functions */
m = Py_InitModule4("UnicodeSplitter", Splitter_module_methods,
......
/*****************************************************************************
Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
This software is subject to the provisions of the Zope Public License,
......@@ -10,6 +10,8 @@
FOR A PARTICULAR PURPOSE
****************************************************************************/
#include "Python.h"
#include <ctype.h>
......@@ -23,6 +25,9 @@ typedef struct
PyObject *text, *synstop;
char *here, *end;
int index;
int allow_single_chars;
int index_numbers;
int max_len;
}
Splitter;
......@@ -98,7 +103,7 @@ check_synstop(Splitter *self, PyObject *word)
cword = PyString_AsString(word);
len = PyString_Size(word);
if(len < 2) /* Single-letter words are stop words! */
if(len < 2 && ! self->allow_single_chars) /* Single-letter words are stop words! */
{
Py_INCREF(Py_None);
return Py_None;
......@@ -110,7 +115,7 @@ check_synstop(Splitter *self, PyObject *word)
for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); )
;
if (len < 0) {
if (len < 0 && ! self->index_numbers) {
Py_INCREF(Py_None);
return Py_None;
}
......@@ -140,12 +145,11 @@ check_synstop(Splitter *self, PyObject *word)
return value; /* Which must be None! */
}
#define MAX_WORD 64 /* Words longer than MAX_WORD are stemmed */
static PyObject *
next_word(Splitter *self, char **startpos, char **endpos)
{
char wbuf[MAX_WORD];
char wbuf[256];
char *end, *here, *b;
int i = 0, c;
PyObject *pyword, *res;
......@@ -175,13 +179,13 @@ next_word(Splitter *self, char **startpos, char **endpos)
if(startpos && i==0)
*startpos=here;
if(i++ < MAX_WORD)
if(i++ < self->max_len)
*b++ = c;
} else if (i != 0) { /* We've found the end of a word */
if(i >= MAX_WORD)
i=MAX_WORD; /* "stem" the long word */
if(i >= self->max_len)
i=self->max_len; /* "stem" the long word */
UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
self->here=here;
......@@ -225,8 +229,8 @@ next_word(Splitter *self, char **startpos, char **endpos)
/* We've reached the end of the string */
if(i >= MAX_WORD)
i=MAX_WORD; /* "stem" the long word */
if(i >= self->max_len)
i=self->max_len; /* "stem" the long word */
if (i == 0) {
/* No words */
......@@ -274,6 +278,31 @@ Splitter_item(Splitter *self, int i)
return word;
}
/*
 * split() -- tokenize the complete text in one run.
 *
 * Resets the splitter, then calls next_word() repeatedly and appends every
 * word to a freshly created list until next_word() hands back Py_None.
 *
 * Returns a new reference to the list, or NULL (with the list released)
 * when list creation, next_word() or the append fails.
 *
 * NOTE(review): next_word() is treated as returning a new reference, as the
 * original loop already released each word it received -- confirm.
 */
static PyObject *
Splitter_split(Splitter*self)
{
  PyObject *list=NULL,*word=NULL;

  UNLESS(list = PyList_New(0)) return NULL;

  Splitter_reset(self);

  while (1) {
    UNLESS(word = next_word(self,NULL,NULL)) goto err;

    if (word == Py_None) {
      /* End marker: release our reference to None before returning. */
      Py_XDECREF(word);
      return list;
    }

    /* PyList_Append takes its own reference; a failure must not be dropped. */
    if (PyList_Append(list,word) < 0) goto err;

    Py_XDECREF(word);
    word = NULL;
  }

err:
  /* Do not leak the partially built list or the pending word. */
  Py_XDECREF(word);
  Py_XDECREF(list);
  return NULL;
}
static PyObject *
Splitter_slice(Splitter *self, int i, int j)
{
......@@ -282,14 +311,14 @@ Splitter_slice(Splitter *self, int i, int j)
}
static PySequenceMethods Splitter_as_sequence = {
(inquiry)Splitter_length, /*sq_length*/
(binaryfunc)Splitter_concat, /*sq_concat*/
(intargfunc)Splitter_repeat, /*sq_repeat*/
(intargfunc)Splitter_item, /*sq_item*/
(intintargfunc)Splitter_slice, /*sq_slice*/
(intobjargproc)0, /*sq_ass_item*/
(intintobjargproc)0, /*sq_ass_slice*/
};
(inquiry)Splitter_length, /*sq_length*/
(binaryfunc)Splitter_concat, /*sq_concat*/
(intargfunc)Splitter_repeat, /*sq_repeat*/
(intargfunc)Splitter_item, /*sq_item*/
(intintargfunc)Splitter_slice, /*sq_slice*/
(intobjargproc)0, /*sq_ass_item*/
(intintobjargproc)0, /*sq_ass_slice*/
};
static PyObject *
Splitter_pos(Splitter *self, PyObject *args)
......@@ -359,8 +388,12 @@ err:
static struct PyMethodDef Splitter_methods[] =
{
{ "split", (PyCFunction)Splitter_split, 0,
"split() -- Split complete string in one run"
},
{ "pos", (PyCFunction)Splitter_pos, 0,
"pos(index) -- Return the starting and ending position of a token"
"pos(index) -- Return the starting and ending position of a token"
},
{ "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
......@@ -378,31 +411,31 @@ Splitter_getattr(Splitter *self, char *name)
static char SplitterType__doc__[] = "";
static PyTypeObject SplitterType = {
PyObject_HEAD_INIT(NULL)
0, /*ob_size*/
"Splitter", /*tp_name*/
sizeof(Splitter), /*tp_basicsize*/
0, /*tp_itemsize*/
/* methods */
(destructor)Splitter_dealloc, /*tp_dealloc*/
(printfunc)0, /*tp_print*/
(getattrfunc)Splitter_getattr, /*tp_getattr*/
(setattrfunc)0, /*tp_setattr*/
(cmpfunc)0, /*tp_compare*/
(reprfunc)0, /*tp_repr*/
0, /*tp_as_number*/
&Splitter_as_sequence, /*tp_as_sequence*/
0, /*tp_as_mapping*/
(hashfunc)0, /*tp_hash*/
(ternaryfunc)0, /*tp_call*/
(reprfunc)0, /*tp_str*/
/* Space for future expansion */
0L,0L,0L,0L,
SplitterType__doc__ /* Documentation string */
};
static char *splitter_args[]={"doc","synstop","encoding",NULL};
PyObject_HEAD_INIT(NULL)
0, /*ob_size*/
"Splitter", /*tp_name*/
sizeof(Splitter), /*tp_basicsize*/
0, /*tp_itemsize*/
/* methods */
(destructor)Splitter_dealloc, /*tp_dealloc*/
(printfunc)0, /*tp_print*/
(getattrfunc)Splitter_getattr, /*tp_getattr*/
(setattrfunc)0, /*tp_setattr*/
(cmpfunc)0, /*tp_compare*/
(reprfunc)0, /*tp_repr*/
0, /*tp_as_number*/
&Splitter_as_sequence, /*tp_as_sequence*/
0, /*tp_as_mapping*/
(hashfunc)0, /*tp_hash*/
(ternaryfunc)0, /*tp_call*/
(reprfunc)0, /*tp_str*/
/* Space for future expansion */
0L,0L,0L,0L,
SplitterType__doc__ /* Documentation string */
};
static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen",NULL};
static PyObject *
......@@ -411,8 +444,28 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
Splitter *self;
PyObject *doc, *synstop = NULL;
char *encoding = "latin1";
int single_char = 0;
int index_numbers = 0;
int max_len= 64;
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args, \
&doc,&synstop,&encoding,&single_char,&index_numbers,&max_len)) return NULL;
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args, &doc,&synstop,&encoding)) return NULL;
if (index_numbers<0 || index_numbers>1) {
PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
return NULL;
}
if (single_char<0 || single_char>1) {
PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
return NULL;
}
if (max_len<1 || max_len>128) {
PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
return NULL;
}
UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
......@@ -430,6 +483,9 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
self->end = self->here + PyString_Size(self->text);
self->index = -1;
self->allow_single_chars = single_char;
self->index_numbers = index_numbers;
self->max_len = max_len;
return (PyObject*)self;
......@@ -442,7 +498,7 @@ err:
static struct PyMethodDef Splitter_module_methods[] =
{
{ "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
"ZopeSplitter(doc[,synstop]) -- Return a word splitter"
"ZopeSplitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen]) -- Return a word splitter"
},
{ NULL, NULL }
......@@ -453,7 +509,7 @@ static char Splitter_module_documentation[] =
"\n"
"for use in an inverted index\n"
"\n"
"$Id: ZopeSplitter.c,v 1.5 2001/11/28 15:51:04 matt Exp $\n"
"$Id: ZopeSplitter.c,v 1.6 2002/01/09 15:17:34 andreasjung Exp $\n"
;
......@@ -461,7 +517,7 @@ void
initZopeSplitter(void)
{
PyObject *m, *d;
char *rev="$Revision: 1.5 $";
char *rev="$Revision: 1.6 $";
/* Create the module and add the functions */
m = Py_InitModule4("ZopeSplitter", Splitter_module_methods,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment