Commit 3d88c027, authored by Evan Simpson

Merge TextIndex fixes from 2.4 branch

parent 233671d4
......@@ -85,7 +85,7 @@
from Lexicon import Lexicon
import Splitter
from Products.PluginIndexes.TextIndex.TextIndex import Or
from TextIndex import Or, Op
import re, string
......@@ -147,14 +147,12 @@ class GlobbingLexicon(Lexicon):
def createDigrams(self, word):
    """Return the list of digrams for *word*, with end markers.

    Each character is paired with its predecessor; ``self.eow`` (the
    end-of-word marker) is used as the predecessor of the first
    character and is appended as the final character, so 'cat' with
    eow='$' yields ['$c', 'ca', 'at', 't$'].

    Unlike the older index-slicing version, this also handles the
    empty string without raising (it returns a single eow+eow digram).
    """
    # Start from the characters themselves, with the end marker last.
    digrams = list(word)
    digrams.append(self.eow)
    # Walk the list pairing each element with the previous one; the
    # beginning-of-word is marked by seeding `last` with eow.
    last = self.eow
    for i in range(len(digrams)):
        last, digrams[i] = digrams[i], last + digrams[i]
    return digrams
......@@ -269,21 +267,30 @@ class GlobbingLexicon(Lexicon):
def query_hook(self, q):
    """Expand wildcard terms in the parsed query, in place.

    Walks the (possibly nested) query list backwards.  Sub-lists are
    processed recursively; operator tokens (Op instances) are left
    alone; any string containing a wildcard character is replaced by
    a sub-list of the matching word ids joined by Or.

    Returns the mutated query list.
    """
    ListType = type([])
    # Iterate backwards so in-place replacement is safe.
    i = len(q) - 1
    while i >= 0:
        e = q[i]
        if isinstance(e, ListType):
            # Nested (parenthesized) sub-query: expand recursively.
            self.query_hook(e)
        elif isinstance(e, Op):
            # Operators pass through untouched.
            pass
        elif ((self.multi_wc in e) or
              (self.single_wc in e)):
            # Wildcard term: replace it with [wid, Or, wid, ...].
            wids = self.get(e)
            words = []
            for wid in wids:
                if words:
                    words.append(Or)
                words.append(wid)
            if not words:
                # if words is empty, insert something that will make
                # textindex's __getitem__ return an empty result list
                words.append('')
            q[i] = words
        i = i - 1
    return q
def Splitter(self, astring, words=None):
""" wrap the splitter """
......@@ -300,18 +307,16 @@ class GlobbingLexicon(Lexicon):
There is no way to quote meta-characters.
"""
# Remove characters that are meaningful in a regex
transTable = string.maketrans("", "")
result = string.translate(pat, transTable,
r'()&|!@#$%^{}\<>.')
# First, deal with mutli-character globbing
result = string.replace(pat, '*', '.*')
# First, deal with multi-character globbing
result = string.replace(result, '*', '.*')
# Next, we need to deal with single-character globbing
result = string.replace(result, '?', '.?')
# Now, we need to remove all of the characters that
# are forbidden.
result = string.translate(result, transTable,
r'()&|!@#$%^{}\<>')
result = string.replace(result, '?', '.')
return "%s$" % result
......
......@@ -267,21 +267,28 @@ class GlobbingLexicon(Lexicon):
def query_hook(self, q):
    """Expand wildcard terms in the parsed query, in place.

    2.4-branch variant: walks the (possibly nested) query list
    backwards, recursing into sub-lists and replacing any string
    containing a wildcard character with a sub-list of the matching
    word ids joined by Or.  Returns the mutated query list.
    """
    ListType = type([])
    # Iterate backwards so in-place replacement is safe.
    i = len(q) - 1
    while i >= 0:
        e = q[i]
        if isinstance(e, ListType):
            # Nested (parenthesized) sub-query: expand recursively.
            self.query_hook(e)
        elif ((self.multi_wc in e) or
              (self.single_wc in e)):
            # Wildcard term: replace it with [wid, Or, wid, ...].
            wids = self.get(e)
            words = []
            for wid in wids:
                if words:
                    words.append(Or)
                words.append(wid)
            if not words:
                # if words is empty, insert something that will make
                # textindex's __getitem__ return an empty result list
                words.append('')
            q[i] = words
        i = i - 1
    return q
def Splitter(self, astring, words=None):
""" wrap the splitter """
......@@ -298,19 +305,16 @@ class GlobbingLexicon(Lexicon):
There is no way to quote meta-characters.
"""
# Remove characters that are meaningful in a regex
transTable = string.maketrans("", "")
result = string.translate(pat, transTable,
r'()&|!@#$%^{}\<>.')
# First, deal with mutli-character globbing
result = string.replace(pat, '*', '.*')
# First, deal with multi-character globbing
result = string.replace(result, '*', '.*')
# Next, we need to deal with single-character globbing
result = string.replace(result, '?', '.?')
# Now, we need to remove all of the characters that
# are forbidden.
result = string.translate(result, transTable,
r'()&|!@#$%^{}\<>')
result = string.replace(result, '?', '.')
return "%s$" % result
......@@ -91,7 +91,7 @@ undo information so that objects can be unindexed when the old value
is no longer known.
"""
__version__ = '$Revision: 1.49 $'[11:-2]
__version__ = '$Revision: 1.50 $'[11:-2]
import string, re
......@@ -428,7 +428,7 @@ class UnTextIndex(Persistent, Implicit):
and a String. Strings are looked up in the lexicon, whereas
Integers are assumed to be resolved word ids. """
if type(word) is IntType:
if isinstance(word, IntType):
# We have a word ID
result = self._index.get(word, {})
return ResultList(result, (word,), self)
......@@ -440,7 +440,7 @@ class UnTextIndex(Persistent, Implicit):
if len(splitSource) == 1:
splitSource = splitSource[0]
if splitSource[:1] == '"' and splitSource[-1:] == '"':
if splitSource[:1] == splitSource[-1:] == '"':
return self[splitSource]
wids=self.getLexicon(self._lexicon).get(splitSource)
......@@ -551,28 +551,37 @@ class UnTextIndex(Persistent, Implicit):
def query(self, s, default_operator=Or):
    """Evaluate a query string.

    Convert the query string into a data structure of nested lists
    and strings, based on the grouping of whitespace-separated
    strings by parentheses and quotes.  The 'Near' operator is
    inserted between the strings of a quoted group.

    The Lexicon is given the opportunity to transform the data
    structure.  Stemming, wildcards, and translation are possible
    Lexicon services.

    Finally, the query list is normalized so that it and every
    sub-list consist of non-operator strings or lists separated by
    operators.  This list is evaluated.
    """
    # First replace any occurrences of " and not " with " andnot "
    # (case-insensitively), so it parses as a single operator token.
    s = re.sub('(?i)\s+and\s*not\s+', ' andnot ', s)

    # Parse parentheses and quotes.
    q = parse(s)

    # Allow the Lexicon to process the query (e.g. expand wildcards).
    q = self.getLexicon(self._lexicon).query_hook(q)

    # Insert the default operator between any two search terms not
    # already joined by an operator.
    q = parse2(q, default_operator)

    # Evaluate the final 'expression'.
    return self.evaluate(q)
......@@ -605,19 +614,17 @@ class UnTextIndex(Persistent, Implicit):
def evaluate(self, query):
"""Evaluate a parsed query"""
# There are two options if the query passed in is only one
# item. It means either it's an embedded query, in which case
# we'll recursively evaluate, other wise it's nothing for us
# to evaluate, and we just get the results and return them.
if (len(query) == 1):
if (type(query[0]) is ListType):
return self.evaluate(query[0])
# Strip off meaningless layers
while isinstance(query, ListType) and len(query) == 1:
query = query[0]
return self[query[0]] # __getitem__
# If it's not a list, assume a string or number
if not isinstance(query, ListType):
return self[query]
# Now we need to loop through the query and expand out
# Now we need to loop through the query and reduce
# operators. They are currently evaluated in the following
# order: AndNote -> And -> Or -> Near
# order: AndNot -> And -> Or -> Near
i = 0
while (i < len(query)):
if query[i] is AndNot:
......@@ -660,98 +667,91 @@ def parse(s):
def parse(s):
    """Parse parentheses and quotes in query string *s*.

    Returns a list of words and operator tokens; each parenthesized
    group becomes a nested list.  The input is lower-cased once up
    front.  Raises QueryError (via parens) on mismatched parentheses.
    """
    l = []
    tmp = s.lower()
    # parens() returns (before, inside, after) for the first balanced
    # group, or None when no parentheses remain.
    p = parens(tmp)
    while p is not None:
        # Look for quotes in the section of the string before
        # the parentheses, then parse the string inside the parens.
        l = l + quotes(p[0])
        l.append(parse(p[1]))
        # Continue looking through the rest of the string.
        tmp = p[2]
        p = parens(tmp)
    return l + quotes(tmp)
def parse2(q, default_operator,
           operator_dict={AndNot: AndNot, And: And, Or: Or, Near: Near}):
    """Normalize the parsed query: alternate operands and operators.

    Walks *q* backwards so insertions don't disturb unvisited
    positions.  Sub-lists are normalized recursively.  Every
    odd-indexed element must be an operator; where it is not, the
    default operator is spliced in.
    """
    isop = operator_dict.has_key
    i = len(q) - 1
    while i >= 0:
        e = q[i]
        if isinstance(e, ListType):
            # Parenthesized sub-query: normalize recursively.
            q[i] = parse2(e, default_operator)
            if i % 2:
                # A sub-list can never serve as an operator.
                q.insert(i, default_operator)
        elif i % 2:
            # This element should be an operator.
            if isop(e):
                # Ensure that it is identical, not merely equal.
                q[i] = operator_dict[e]
            else:
                # Insert the default operator.
                q.insert(i, default_operator)
        i = i - 1
    return q
def parens(s, parens_re=re.compile('[()]').search):
    """Find the first balanced parenthesized group in *s*.

    Returns a 3-tuple (text-before, text-inside, text-after) for the
    first balanced group, or None when *s* contains no parentheses.
    Raises QueryError on mismatched parentheses.
    """
    mo = parens_re(s)
    if mo is None:
        # No parentheses at all.
        return
    # Content of the group starts just after the first paren found.
    open_index = mo.start(0) + 1
    paren_count = 0
    while mo is not None:
        index = mo.start(0)
        if s[index] == '(':
            paren_count = paren_count + 1
        else:
            paren_count = paren_count - 1
            if paren_count == 0:
                # The group opened first has just been closed.
                return (s[:open_index - 1], s[open_index:index],
                        s[index + 1:])
            if paren_count < 0:
                # A close paren with no matching open.
                break
        mo = parens_re(s, index + 1)
    raise QueryError("Mismatched parentheses")
def quotes(s):
    """Split *s* into words, honoring double-quoted phrases.

    Without quotes this is a plain whitespace split.  With quotes,
    the words inside each quoted region are joined by the Near
    (proximity) operator.  Raises QueryError on an odd number of
    quote characters.  Empty fragments are dropped from the result.
    """
    if '"' not in s:
        # Fast path: no quotes, just split the string into words.
        return s.split()

    # Split up quoted regions; odd indexes of `splitted` are the
    # quoted parts, even indexes the unquoted parts.
    splitted = re.split('\s*\"\s*', s)
    if (len(splitted) % 2) == 0:
        raise QueryError("Mismatched quotes")

    for i in range(1, len(splitted), 2):
        # Split the quoted region into words.
        words = splitted[i] = splitted[i].split()
        # Put the Proximity operator in between quoted words.
        j = len(words) - 1
        while j > 0:
            words.insert(j, Near)
            j = j - 1

    i = len(splitted) - 1
    while i >= 0:
        # Split the non-quoted region into words (splicing the words
        # in place of the region, backwards so indexes stay valid).
        splitted[i:i+1] = splitted[i].split()
        i = i - 2
    return [w for w in splitted if w]
......@@ -217,8 +217,8 @@ class Tests(unittest.TestCase):
"""This license has been certified as Open Source(tm).""",
"""I hope I get to work on time""",
]
def globTest(self, qmap, rlist):
    "Check a glob query"
    # Build a fresh index over the sample texts using the
    # GlobbingLexicon, run query map `qmap`, and assert the result
    # document ids equal `rlist`.  Returns the bound _apply_index so
    # callers can issue follow-up queries against the same index.
    index = self.dbopen()
    index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
    for i in range(len(self.sample_texts)):
        self.doc.text = self.sample_texts[i]
        index.index_object(i, self.doc)
    get_transaction().commit()
    self.dbclose()

    index = self.dbopen()
    r = list(index._apply_index(qmap)[0].keys())
    assert r == rlist, r
    return index._apply_index
def checkStarQuery(self):
    "Check a star query"
    self.globTest({'text':'m*n'}, [0,2])
def checkAndQuery(self):
    "Check an AND query"
    self.globTest({'text':'time and country'}, [0,])
def checkOrQuery(self):
    "Check an OR query"
    self.globTest({'text':'time or country'}, [0,1,6])
def checkDefOrQuery(self):
    "Check a default OR query"
    self.globTest({'text':'time country'}, [0,1,6])
def checkNearQuery(self):
    """Check a NEAR query.. (NOTE:ACTUALLY AN 'AND' TEST!!)"""
    # NEAR never worked, so Zopes post-2.3.1b3 define near to mean AND
    self.globTest({'text':'time ... country'}, [0,])
def checkQuotesQuery(self):
    """Check a quoted query"""
    # A matching quoted phrase hits document 0; a phrase whose words
    # exist but not in sequence must return nothing.
    ai = self.globTest({'text':'"This is the time"'}, [0,])

    r = list(ai({'text':'"now is the time"'})[0].keys())
    assert r == [], r
def checkAndNotQuery(self):
    "Check an ANDNOT query"
    self.globTest({'text':'time and not country'}, [6,])
def checkParenMatchingQuery(self):
    "Check a query with parens"
    ai = self.globTest({'text':'(time and country) men'}, [0,])

    r = list(ai({'text':'(time and not country) or men'})[0].keys())
    assert r == [0, 6], r
def checkQuoteMatchingQuery(self):
    # NOTE(review): superseded by checkQuotesQuery, which covers the
    # same cases via globTest; kept here formatted for reference.
    "Check a query with quotes.. this is known to fail under 2.3.1b3-"
    index = self.dbopen()
    index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
    for i in range(len(self.sample_texts)):
        self.doc.text = self.sample_texts[i]
        index.index_object(i, self.doc)
    get_transaction().commit()
    self.dbclose()

    index = self.dbopen()
    r = list(index._apply_index({'text':'"This is the time"'})[0].keys())
    assert r == [0], r

    r = list(index._apply_index({'text':'"now is the time"'})[0].keys())
    assert r == [], r
def checkTextIndexOperatorQuery(self):
    "Check a query with 'textindex_operator' in the request"
    self.globTest({'text':'time men', 'textindex_operator':'and'}, [0,])
def checkNonExistentWord(self):
    """ Check for nonexistent word """
    self.globTest({'text':'zop'}, [])
def checkComplexQuery1(self):
    """ Check complex query 1 """
    self.globTest({'text':'((?ount* or get) and not wait) '
                          '"been *ert*"'}, [0, 1, 5, 6])
def test_suite():
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment