Objects to be indexed with a TextIndex can now define an additional

attribute column+'_encoding' to specify an encoding other than 'latin1'. This encoding is needed when the UnicodeSplitter is used to convert a Python string to unicode. Not setting the column+'_encoding' attribute retains the standard behaviour.

Objects to be indexed with a TextIndex can now define an additional
attribute column+'_encoding' to specify an encoding other than 'latin1'. This encoding is needed when the UnicodeSplitter is used to convert a Python string to unicode. Not setting the column+'_encoding' attribute retains the standard behaviour.
52758876 · Andreas Jung · 264c7750 · 52758876 · 52758876 · 52758876
Commit 52758876 authored Oct 17, 2001 by Andreas Jung
5 changed files
--- a/lib/python/Products/PluginIndexes/TextIndex/GlobbingLexicon.py
+++ b/lib/python/Products/PluginIndexes/TextIndex/GlobbingLexicon.py
@@ -303,13 +303,13 @@ class GlobbingLexicon(Lexicon):

        return q

-    def Splitter(self, astring, words=None):
+    def Splitter(self, astring, words=None, encoding="latin1"):
        """ wrap the splitter """

        ## don't do anything, less efficient but there's not much
        ## sense in stemming a globbing lexicon.

-        return self.SplitterFunc(astring)
+        return self.SplitterFunc(astring,encoding=encoding)


    def createRegex(self, pat):

--- a/lib/python/Products/PluginIndexes/TextIndex/Lexicon.py
+++ b/lib/python/Products/PluginIndexes/TextIndex/Lexicon.py
@@ -223,11 +223,11 @@ class Lexicon(Persistent, Implicit):
        return len(self._lexicon)


-    def Splitter(self, astring, words=None):
+    def Splitter(self, astring, words=None, encoding = "latin1"):
        """ wrap the splitter """
        if words is None:
            words = self.stop_syn
-        return self.SplitterFunc(astring, words)
+        return self.SplitterFunc(astring, words, encoding)


    def query_hook(self, q):

--- a/lib/python/Products/PluginIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/__init__.py
+++ b/lib/python/Products/PluginIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/__init__.py
-from ISO_8859_1_Splitter import ISO_8859_1_Splitter as Splitter
+from ISO_8859_1_Splitter import ISO_8859_1_Splitter
+
+def Splitter(txt,stopwords=None,encoding='latin1'):
+    return ISO_8859_1_Splitter(txt,stopwords)
--- a/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/__init__.py
+++ b/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/__init__.py
-from ZopeSplitter import ZopeSplitter as Splitter
+from ZopeSplitter import ZopeSplitter 
+
+def Splitter(txt,stopwords={},encoding="latin1"):
+    return ZopeSplitter(txt,stopwords)
--- a/lib/python/Products/PluginIndexes/TextIndex/TextIndex.py
+++ b/lib/python/Products/PluginIndexes/TextIndex/TextIndex.py
@@ -87,7 +87,7 @@

 """

-__version__ = '$Revision: 1.20 $'[11:-2]
+__version__ = '$Revision: 1.21 $'[11:-2]


 import string, re
@@ -355,6 +355,18 @@ class TextIndex(PluggableIndex.PluggableIndex, Persistent,
                source = str(source)
        except (AttributeError, TypeError):
            return 0
+
+        # sniff the object for 'id'+'_encoding'
+        
+        try:
+            encoding = getattr(obj, self.id+'_encoding')
+            if callable(encoding ):
+                encoding = str(encoding())
+            else:
+                encoding = str(encoding)
+        except (AttributeError, TypeError):
+            encoding = 'latin1'
+
        
        lexicon = self.getLexicon()

@@ -365,7 +377,7 @@ class TextIndex(PluggableIndex.PluggableIndex, Persistent,
        
        # Run through the words and score them

-        for word in list(splitter(source)):
+        for word in list(splitter(source,encoding=encoding)):
            if word[0] == '\"':
                last = self._subindex(word[1:-1], wordScores, last, splitter)
            else: