Commit a989ae56 authored by Andreas Jung

introducing new 'casefolding' parameter

parent 97f1e13c
......@@ -35,6 +35,7 @@ typedef struct
int allow_single_chars;
int index_numbers;
int max_len;
int casefolding;
}
Splitter;
......@@ -251,7 +252,10 @@ next_word(Splitter *self, char **startpos, char **endpos)
continue;
}
c=mytolower(*here);
if (self->casefolding)
c=mytolower(*here);
else
c = (*here);
/* Check to see if this character is part of a word */
......@@ -490,7 +494,7 @@ static PyTypeObject SplitterType = {
SplitterType__doc__ /* Documentation string */
};
static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen",NULL};
static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen","casefolding",NULL};
static PyObject *
get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
......@@ -501,8 +505,9 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
int single_char = 0;
int index_numbers = 0;
int max_len=64;
int casefolding=1;
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&single_char,&index_numbers,&max_len)) return NULL;
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiiii",splitter_args,&doc,&synstop,&encoding,&single_char,&index_numbers,&max_len,&casefolding)) return NULL;
if (index_numbers<0 || index_numbers>1) {
......@@ -510,6 +515,11 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
return NULL;
}
if (casefolding<0 || casefolding>1) {
PyErr_SetString(PyExc_ValueError,"casefolding must be 0 or 1");
return NULL;
}
if (single_char<0 || single_char>1) {
PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
return NULL;
......@@ -521,7 +531,6 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
}
UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
if(synstop) {
......@@ -539,6 +548,7 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
self->allow_single_chars = single_char;
self->index_numbers = index_numbers;
self->max_len = max_len;
self->casefolding = casefolding;
self->index = -1;
......@@ -553,7 +563,7 @@ err:
static struct PyMethodDef Splitter_module_methods[] =
{
{ "ISO_8859_1_Splitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
"ISO_8859_1_Splitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen]) -- Return a word splitter"
"ISO_8859_1_Splitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen][,casefolding]) -- Return a word splitter"
},
{ NULL, NULL }
......@@ -564,7 +574,7 @@ static char Splitter_module_documentation[] =
"\n"
"for use in an inverted index\n"
"\n"
"$Id: ISO_8859_1_Splitter.c,v 1.6 2002/01/09 15:17:34 andreasjung Exp $\n"
"$Id: ISO_8859_1_Splitter.c,v 1.7 2002/01/21 19:28:55 andreasjung Exp $\n"
;
......@@ -573,20 +583,22 @@ void
initISO_8859_1_Splitter(void)
{
PyObject *m, *d;
char *rev="$Revision: 1.6 $";
char *rev="$Revision: 1.7 $";
/* Create the module and add the functions */
initSplitterTrtabs();
if (PyErr_Occurred()) Py_FatalError("can't initialize module Splitter 1");
m = Py_InitModule4("ISO_8859_1_Splitter", Splitter_module_methods,
Splitter_module_documentation,
(PyObject*)NULL,PYTHON_API_VERSION);
if (PyErr_Occurred()) Py_FatalError("can't initialize module Splitter 2");
/* Add some symbolic constants to the module */
d = PyModule_GetDict(m);
if (PyErr_Occurred()) Py_FatalError("can't initialize module Splitter 3");
PyDict_SetItemString(d, "__version__",
PyString_FromStringAndSize(rev+11,strlen(rev+11)-2));
if (PyErr_Occurred()) Py_FatalError("can't initialize module Splitter 4");
if (PyErr_Occurred())
Py_FatalError("can't initialize module Splitter");
}
......@@ -25,11 +25,12 @@ typedef struct
int max_len;
int allow_single_chars;
int index_numbers;
int casefolding;
}
Splitter;
static
PyUnicodeObject *prepareString(PyUnicodeObject *o);
PyUnicodeObject *prepareString(Splitter *self, PyUnicodeObject *o);
static PyObject *checkSynword(Splitter *self, PyObject *word)
{
......@@ -201,7 +202,7 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
int i=0;
int start=0;
doc1 = prepareString(doc);
doc1 = prepareString(self,doc);
if (doc1 == NULL)
return -1;
......@@ -297,18 +298,20 @@ void fixlower(PyUnicodeObject *self)
static
PyUnicodeObject *prepareString(PyUnicodeObject *o)
PyUnicodeObject *prepareString(Splitter *self,PyUnicodeObject *o)
{
PyUnicodeObject *u;
u = (PyUnicodeObject*) PyUnicode_FromUnicode(o->str, o->length);
if (u != NULL)
fixlower(u);
if (u != NULL){
if (self->casefolding)
fixlower(u);
}
return u;
}
static char *splitter_args[]={"doc","synstop","encoding","indexnumbers","singlechar","maxlen",NULL};
static char *splitter_args[]={"doc","synstop","encoding","indexnumbers","singlechar","maxlen","casefolding",NULL};
static PyObject *
......@@ -320,8 +323,9 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
int index_numbers = 0;
int max_len=64;
int single_char = 0;
int casefolding=1;
if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&index_numbers,&single_char,&max_len))) return NULL;
if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiiii",splitter_args,&doc,&synstop,&encoding,&index_numbers,&single_char,&max_len,&casefolding))) return NULL;
#ifdef DEBUG
puts("got text");
......@@ -334,6 +338,11 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
return NULL;
}
if (casefolding<0 || casefolding>1) {
PyErr_SetString(PyExc_ValueError,"casefolding must be 0 or 1");
return NULL;
}
if (single_char<0 || single_char>1) {
PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
return NULL;
......@@ -371,6 +380,7 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
self->index_numbers = index_numbers;
self->max_len = max_len;
self->allow_single_chars = single_char;
self->casefolding = casefolding;
if ((splitUnicodeString(self,(PyUnicodeObject *)unicodedoc)) < 0)
goto err;
......@@ -389,7 +399,7 @@ static struct PyMethodDef Splitter_module_methods[] =
{
{ "UnicodeSplitter", (PyCFunction)newSplitter,
METH_VARARGS|METH_KEYWORDS,
"UnicodeSplitter(doc[,synstop][,encoding='latin1']) "
"UnicodeSplitter(doc[,synstop][,encoding='latin1'][,indexnumbers][,maxlen][,singlechar][,casefolding]) "
"-- Return a word splitter"
},
{ NULL, NULL }
......@@ -400,7 +410,7 @@ static char Splitter_module_documentation[] =
"\n"
"for use in an inverted index\n"
"\n"
"$Id: UnicodeSplitter.c,v 1.13 2002/01/09 15:17:34 andreasjung Exp $\n"
"$Id: UnicodeSplitter.c,v 1.14 2002/01/21 19:28:55 andreasjung Exp $\n"
;
......@@ -408,7 +418,7 @@ void
initUnicodeSplitter(void)
{
PyObject *m, *d;
char *rev="$Revision: 1.13 $";
char *rev="$Revision: 1.14 $";
/* Create the module and add the functions */
m = Py_InitModule4("UnicodeSplitter", Splitter_module_methods,
......
......@@ -28,6 +28,7 @@ typedef struct
int allow_single_chars;
int index_numbers;
int max_len;
int casefolding;
}
Splitter;
......@@ -170,7 +171,10 @@ next_word(Splitter *self, char **startpos, char **endpos)
continue;
}
c=tolower((unsigned char) *here);
if (self->casefolding)
c = tolower((unsigned char) *here);
else
c = (unsigned char) *here;
/* Check to see if this character is part of a word */
......@@ -435,7 +439,7 @@ static PyTypeObject SplitterType = {
SplitterType__doc__ /* Documentation string */
};
static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen",NULL};
static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen","casefolding",NULL};
static PyObject *
......@@ -447,9 +451,17 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
int single_char = 0;
int index_numbers = 0;
int max_len= 64;
int casefolding = 1;
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args, \
&doc,&synstop,&encoding,&single_char,&index_numbers,&max_len)) return NULL;
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiiii",splitter_args, \
&doc,
&synstop,
&encoding,
&single_char,
&index_numbers,
&max_len,
&casefolding
)) return NULL;
if (index_numbers<0 || index_numbers>1) {
......@@ -457,6 +469,11 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
return NULL;
}
if (casefolding<0 || casefolding>1) {
PyErr_SetString(PyExc_ValueError,"casefolding must be 0 or 1");
return NULL;
}
if (single_char<0 || single_char>1) {
PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
return NULL;
......@@ -486,6 +503,7 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
self->allow_single_chars = single_char;
self->index_numbers = index_numbers;
self->max_len = max_len;
self->casefolding = casefolding;
return (PyObject*)self;
......@@ -498,7 +516,7 @@ err:
static struct PyMethodDef Splitter_module_methods[] =
{
{ "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
"ZopeSplitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen]) -- Return a word splitter"
"ZopeSplitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen][,casefolding]) -- Return a word splitter"
},
{ NULL, NULL }
......@@ -509,7 +527,7 @@ static char Splitter_module_documentation[] =
"\n"
"for use in an inverted index\n"
"\n"
"$Id: ZopeSplitter.c,v 1.6 2002/01/09 15:17:34 andreasjung Exp $\n"
"$Id: ZopeSplitter.c,v 1.7 2002/01/21 19:28:55 andreasjung Exp $\n"
;
......@@ -517,7 +535,7 @@ void
initZopeSplitter(void)
{
PyObject *m, *d;
char *rev="$Revision: 1.6 $";
char *rev="$Revision: 1.7 $";
/* Create the module and add the functions */
m = Py_InitModule4("ZopeSplitter", Splitter_module_methods,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment