Commit a989ae56 authored by Andreas Jung

introducing new 'casefolding' parameter

parent 97f1e13c
...@@ -35,6 +35,7 @@ typedef struct ...@@ -35,6 +35,7 @@ typedef struct
int allow_single_chars; int allow_single_chars;
int index_numbers; int index_numbers;
int max_len; int max_len;
int casefolding;
} }
Splitter; Splitter;
...@@ -251,7 +252,10 @@ next_word(Splitter *self, char **startpos, char **endpos) ...@@ -251,7 +252,10 @@ next_word(Splitter *self, char **startpos, char **endpos)
continue; continue;
} }
c=mytolower(*here); if (self->casefolding)
c=mytolower(*here);
else
c = (*here);
/* Check to see if this character is part of a word */ /* Check to see if this character is part of a word */
...@@ -490,7 +494,7 @@ static PyTypeObject SplitterType = { ...@@ -490,7 +494,7 @@ static PyTypeObject SplitterType = {
SplitterType__doc__ /* Documentation string */ SplitterType__doc__ /* Documentation string */
}; };
static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen",NULL}; static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen","casefolding",NULL};
static PyObject * static PyObject *
get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds) get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
...@@ -501,8 +505,9 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -501,8 +505,9 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
int single_char = 0; int single_char = 0;
int index_numbers = 0; int index_numbers = 0;
int max_len=64; int max_len=64;
int casefolding=1;
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&single_char,&index_numbers,&max_len)) return NULL; UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiiii",splitter_args,&doc,&synstop,&encoding,&single_char,&index_numbers,&max_len,&casefolding)) return NULL;
if (index_numbers<0 || index_numbers>1) { if (index_numbers<0 || index_numbers>1) {
...@@ -510,6 +515,11 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -510,6 +515,11 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
return NULL; return NULL;
} }
if (casefolding<0 || casefolding>1) {
PyErr_SetString(PyExc_ValueError,"casefolding must be 0 or 1");
return NULL;
}
if (single_char<0 || single_char>1) { if (single_char<0 || single_char>1) {
PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1"); PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
return NULL; return NULL;
...@@ -521,7 +531,6 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -521,7 +531,6 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
} }
UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL; UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
if(synstop) { if(synstop) {
...@@ -539,6 +548,7 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -539,6 +548,7 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
self->allow_single_chars = single_char; self->allow_single_chars = single_char;
self->index_numbers = index_numbers; self->index_numbers = index_numbers;
self->max_len = max_len; self->max_len = max_len;
self->casefolding = casefolding;
self->index = -1; self->index = -1;
...@@ -553,7 +563,7 @@ err: ...@@ -553,7 +563,7 @@ err:
static struct PyMethodDef Splitter_module_methods[] = static struct PyMethodDef Splitter_module_methods[] =
{ {
{ "ISO_8859_1_Splitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS, { "ISO_8859_1_Splitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
"ISO_8859_1_Splitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen]) -- Return a word splitter" "ISO_8859_1_Splitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen][,casefolding]) -- Return a word splitter"
}, },
{ NULL, NULL } { NULL, NULL }
...@@ -564,7 +574,7 @@ static char Splitter_module_documentation[] = ...@@ -564,7 +574,7 @@ static char Splitter_module_documentation[] =
"\n" "\n"
"for use in an inverted index\n" "for use in an inverted index\n"
"\n" "\n"
"$Id: ISO_8859_1_Splitter.c,v 1.6 2002/01/09 15:17:34 andreasjung Exp $\n" "$Id: ISO_8859_1_Splitter.c,v 1.7 2002/01/21 19:28:55 andreasjung Exp $\n"
; ;
...@@ -573,20 +583,22 @@ void ...@@ -573,20 +583,22 @@ void
initISO_8859_1_Splitter(void) initISO_8859_1_Splitter(void)
{ {
PyObject *m, *d; PyObject *m, *d;
char *rev="$Revision: 1.6 $"; char *rev="$Revision: 1.7 $";
/* Create the module and add the functions */ /* Create the module and add the functions */
initSplitterTrtabs(); initSplitterTrtabs();
if (PyErr_Occurred()) Py_FatalError("can't initialize module Splitter 1");
m = Py_InitModule4("ISO_8859_1_Splitter", Splitter_module_methods, m = Py_InitModule4("ISO_8859_1_Splitter", Splitter_module_methods,
Splitter_module_documentation, Splitter_module_documentation,
(PyObject*)NULL,PYTHON_API_VERSION); (PyObject*)NULL,PYTHON_API_VERSION);
if (PyErr_Occurred()) Py_FatalError("can't initialize module Splitter 2");
/* Add some symbolic constants to the module */ /* Add some symbolic constants to the module */
d = PyModule_GetDict(m); d = PyModule_GetDict(m);
if (PyErr_Occurred()) Py_FatalError("can't initialize module Splitter 3");
PyDict_SetItemString(d, "__version__", PyDict_SetItemString(d, "__version__",
PyString_FromStringAndSize(rev+11,strlen(rev+11)-2)); PyString_FromStringAndSize(rev+11,strlen(rev+11)-2));
if (PyErr_Occurred()) Py_FatalError("can't initialize module Splitter 4");
if (PyErr_Occurred())
Py_FatalError("can't initialize module Splitter");
} }
...@@ -25,11 +25,12 @@ typedef struct ...@@ -25,11 +25,12 @@ typedef struct
int max_len; int max_len;
int allow_single_chars; int allow_single_chars;
int index_numbers; int index_numbers;
int casefolding;
} }
Splitter; Splitter;
static static
PyUnicodeObject *prepareString(PyUnicodeObject *o); PyUnicodeObject *prepareString(Splitter *self, PyUnicodeObject *o);
static PyObject *checkSynword(Splitter *self, PyObject *word) static PyObject *checkSynword(Splitter *self, PyObject *word)
{ {
...@@ -201,7 +202,7 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc) ...@@ -201,7 +202,7 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
int i=0; int i=0;
int start=0; int start=0;
doc1 = prepareString(doc); doc1 = prepareString(self,doc);
if (doc1 == NULL) if (doc1 == NULL)
return -1; return -1;
...@@ -297,18 +298,20 @@ void fixlower(PyUnicodeObject *self) ...@@ -297,18 +298,20 @@ void fixlower(PyUnicodeObject *self)
static static
PyUnicodeObject *prepareString(PyUnicodeObject *o) PyUnicodeObject *prepareString(Splitter *self,PyUnicodeObject *o)
{ {
PyUnicodeObject *u; PyUnicodeObject *u;
u = (PyUnicodeObject*) PyUnicode_FromUnicode(o->str, o->length); u = (PyUnicodeObject*) PyUnicode_FromUnicode(o->str, o->length);
if (u != NULL) if (u != NULL){
fixlower(u); if (self->casefolding)
fixlower(u);
}
return u; return u;
} }
static char *splitter_args[]={"doc","synstop","encoding","indexnumbers","singlechar","maxlen",NULL}; static char *splitter_args[]={"doc","synstop","encoding","indexnumbers","singlechar","maxlen","casefolding",NULL};
static PyObject * static PyObject *
...@@ -320,8 +323,9 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -320,8 +323,9 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
int index_numbers = 0; int index_numbers = 0;
int max_len=64; int max_len=64;
int single_char = 0; int single_char = 0;
int casefolding=1;
if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&index_numbers,&single_char,&max_len))) return NULL; if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiiii",splitter_args,&doc,&synstop,&encoding,&index_numbers,&single_char,&max_len,&casefolding))) return NULL;
#ifdef DEBUG #ifdef DEBUG
puts("got text"); puts("got text");
...@@ -334,6 +338,11 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -334,6 +338,11 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
return NULL; return NULL;
} }
if (casefolding<0 || casefolding>1) {
PyErr_SetString(PyExc_ValueError,"casefolding must be 0 or 1");
return NULL;
}
if (single_char<0 || single_char>1) { if (single_char<0 || single_char>1) {
PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1"); PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
return NULL; return NULL;
...@@ -371,6 +380,7 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -371,6 +380,7 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
self->index_numbers = index_numbers; self->index_numbers = index_numbers;
self->max_len = max_len; self->max_len = max_len;
self->allow_single_chars = single_char; self->allow_single_chars = single_char;
self->casefolding = casefolding;
if ((splitUnicodeString(self,(PyUnicodeObject *)unicodedoc)) < 0) if ((splitUnicodeString(self,(PyUnicodeObject *)unicodedoc)) < 0)
goto err; goto err;
...@@ -389,7 +399,7 @@ static struct PyMethodDef Splitter_module_methods[] = ...@@ -389,7 +399,7 @@ static struct PyMethodDef Splitter_module_methods[] =
{ {
{ "UnicodeSplitter", (PyCFunction)newSplitter, { "UnicodeSplitter", (PyCFunction)newSplitter,
METH_VARARGS|METH_KEYWORDS, METH_VARARGS|METH_KEYWORDS,
"UnicodeSplitter(doc[,synstop][,encoding='latin1']) " "UnicodeSplitter(doc[,synstop][,encoding='latin1'][,indexnumbers][,maxlen][,singlechar][,casefolding]) "
"-- Return a word splitter" "-- Return a word splitter"
}, },
{ NULL, NULL } { NULL, NULL }
...@@ -400,7 +410,7 @@ static char Splitter_module_documentation[] = ...@@ -400,7 +410,7 @@ static char Splitter_module_documentation[] =
"\n" "\n"
"for use in an inverted index\n" "for use in an inverted index\n"
"\n" "\n"
"$Id: UnicodeSplitter.c,v 1.13 2002/01/09 15:17:34 andreasjung Exp $\n" "$Id: UnicodeSplitter.c,v 1.14 2002/01/21 19:28:55 andreasjung Exp $\n"
; ;
...@@ -408,7 +418,7 @@ void ...@@ -408,7 +418,7 @@ void
initUnicodeSplitter(void) initUnicodeSplitter(void)
{ {
PyObject *m, *d; PyObject *m, *d;
char *rev="$Revision: 1.13 $"; char *rev="$Revision: 1.14 $";
/* Create the module and add the functions */ /* Create the module and add the functions */
m = Py_InitModule4("UnicodeSplitter", Splitter_module_methods, m = Py_InitModule4("UnicodeSplitter", Splitter_module_methods,
......
...@@ -28,6 +28,7 @@ typedef struct ...@@ -28,6 +28,7 @@ typedef struct
int allow_single_chars; int allow_single_chars;
int index_numbers; int index_numbers;
int max_len; int max_len;
int casefolding;
} }
Splitter; Splitter;
...@@ -170,7 +171,10 @@ next_word(Splitter *self, char **startpos, char **endpos) ...@@ -170,7 +171,10 @@ next_word(Splitter *self, char **startpos, char **endpos)
continue; continue;
} }
c=tolower((unsigned char) *here); if (self->casefolding)
c = tolower((unsigned char) *here);
else
c = (unsigned char) *here;
/* Check to see if this character is part of a word */ /* Check to see if this character is part of a word */
...@@ -435,7 +439,7 @@ static PyTypeObject SplitterType = { ...@@ -435,7 +439,7 @@ static PyTypeObject SplitterType = {
SplitterType__doc__ /* Documentation string */ SplitterType__doc__ /* Documentation string */
}; };
static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen",NULL}; static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen","casefolding",NULL};
static PyObject * static PyObject *
...@@ -447,9 +451,17 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds) ...@@ -447,9 +451,17 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
int single_char = 0; int single_char = 0;
int index_numbers = 0; int index_numbers = 0;
int max_len= 64; int max_len= 64;
int casefolding = 1;
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args, \ UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiiii",splitter_args, \
&doc,&synstop,&encoding,&single_char,&index_numbers,&max_len)) return NULL; &doc,
&synstop,
&encoding,
&single_char,
&index_numbers,
&max_len,
&casefolding
)) return NULL;
if (index_numbers<0 || index_numbers>1) { if (index_numbers<0 || index_numbers>1) {
...@@ -457,6 +469,11 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds) ...@@ -457,6 +469,11 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
return NULL; return NULL;
} }
if (casefolding<0 || casefolding>1) {
PyErr_SetString(PyExc_ValueError,"casefolding must be 0 or 1");
return NULL;
}
if (single_char<0 || single_char>1) { if (single_char<0 || single_char>1) {
PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1"); PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
return NULL; return NULL;
...@@ -486,6 +503,7 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds) ...@@ -486,6 +503,7 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
self->allow_single_chars = single_char; self->allow_single_chars = single_char;
self->index_numbers = index_numbers; self->index_numbers = index_numbers;
self->max_len = max_len; self->max_len = max_len;
self->casefolding = casefolding;
return (PyObject*)self; return (PyObject*)self;
...@@ -498,7 +516,7 @@ err: ...@@ -498,7 +516,7 @@ err:
static struct PyMethodDef Splitter_module_methods[] = static struct PyMethodDef Splitter_module_methods[] =
{ {
{ "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS, { "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
"ZopeSplitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen]) -- Return a word splitter" "ZopeSplitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen][,casefolding]) -- Return a word splitter"
}, },
{ NULL, NULL } { NULL, NULL }
...@@ -509,7 +527,7 @@ static char Splitter_module_documentation[] = ...@@ -509,7 +527,7 @@ static char Splitter_module_documentation[] =
"\n" "\n"
"for use in an inverted index\n" "for use in an inverted index\n"
"\n" "\n"
"$Id: ZopeSplitter.c,v 1.6 2002/01/09 15:17:34 andreasjung Exp $\n" "$Id: ZopeSplitter.c,v 1.7 2002/01/21 19:28:55 andreasjung Exp $\n"
; ;
...@@ -517,7 +535,7 @@ void ...@@ -517,7 +535,7 @@ void
initZopeSplitter(void) initZopeSplitter(void)
{ {
PyObject *m, *d; PyObject *m, *d;
char *rev="$Revision: 1.6 $"; char *rev="$Revision: 1.7 $";
/* Create the module and add the functions */ /* Create the module and add the functions */
m = Py_InitModule4("ZopeSplitter", Splitter_module_methods, m = Py_InitModule4("ZopeSplitter", Splitter_module_methods,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment