Commit 52758876 authored by Andreas Jung

Objects to be indexed with a TextIndex can now define an additional

attribute column+'_encoding' to specify an encoding other than 'latin1'.
This encoding is needed when the UnicodeSplitter is used to convert
a Python string to unicode. Not setting the column+'_encoding' attribute
retains the standard behaviour.
parent 264c7750
......@@ -303,13 +303,13 @@ class GlobbingLexicon(Lexicon):
return q
def Splitter(self, astring, words=None):
def Splitter(self, astring, words=None, encoding="latin1"):
""" wrap the splitter """
## don't do anything, less efficient but there's not much
## sense in stemming a globbing lexicon.
return self.SplitterFunc(astring)
return self.SplitterFunc(astring,encoding=encoding)
def createRegex(self, pat):
......
......@@ -223,11 +223,11 @@ class Lexicon(Persistent, Implicit):
return len(self._lexicon)
def Splitter(self, astring, words=None):
def Splitter(self, astring, words=None, encoding = "latin1"):
""" wrap the splitter """
if words is None:
words = self.stop_syn
return self.SplitterFunc(astring, words)
return self.SplitterFunc(astring, words, encoding)
def query_hook(self, q):
......
from ISO_8859_1_Splitter import ISO_8859_1_Splitter as Splitter
from ISO_8859_1_Splitter import ISO_8859_1_Splitter
def Splitter(txt, stopwords=None, encoding='latin1'):
    """Wrap ISO_8859_1_Splitter behind the (txt, stopwords, encoding) signature.

    ``encoding`` is accepted so that callers can pass it uniformly to every
    splitter, but it is not forwarded: only ``txt`` and ``stopwords`` reach
    ISO_8859_1_Splitter.
    """
    splitter = ISO_8859_1_Splitter(txt, stopwords)
    return splitter
from ZopeSplitter import ZopeSplitter as Splitter
from ZopeSplitter import ZopeSplitter
def Splitter(txt, stopwords={}, encoding="latin1"):
    """Wrap ZopeSplitter behind the (txt, stopwords, encoding) signature.

    ``encoding`` is accepted so that callers can pass it uniformly to every
    splitter, but it is not forwarded: only ``txt`` and ``stopwords`` reach
    ZopeSplitter.
    """
    # NOTE(review): the default ``stopwords`` dict is a single object shared
    # by all calls. This wrapper never mutates it; whether ZopeSplitter does
    # is not visible from here -- confirm before relying on it.
    splitter = ZopeSplitter(txt, stopwords)
    return splitter
......@@ -87,7 +87,7 @@
"""
__version__ = '$Revision: 1.20 $'[11:-2]
__version__ = '$Revision: 1.21 $'[11:-2]
import string, re
......@@ -355,6 +355,18 @@ class TextIndex(PluggableIndex.PluggableIndex, Persistent,
source = str(source)
except (AttributeError, TypeError):
return 0
# sniff the object for 'id'+'_encoding'
try:
encoding = getattr(obj, self.id+'_encoding')
if callable(encoding ):
encoding = str(encoding())
else:
encoding = str(encoding)
except (AttributeError, TypeError):
encoding = 'latin1'
lexicon = self.getLexicon()
......@@ -365,7 +377,7 @@ class TextIndex(PluggableIndex.PluggableIndex, Persistent,
# Run through the words and score them
for word in list(splitter(source)):
for word in list(splitter(source,encoding=encoding)):
if word[0] == '\"':
last = self._subindex(word[1:-1], wordScores, last, splitter)
else:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment