Commit 43fd91db authored by Andreas Jung

- added a prototype for a new TextIndex (TextIndexNG) with support for NEAR
  search and stemming for eleven languages
- TextIndex code cleanup
- refactoring of the existing TextIndex code
parent 22ad134e
#############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from BTrees.IIBTree import IIBucket,IISet
from BTrees.OOBTree import OOSet
from BTrees.IIBTree import weightedIntersection, difference
from BTrees.IIBTree import union as Iunion
from BTrees.IIBTree import intersection as Iintersection
from BTrees.OOBTree import union as Ounion
from types import TupleType
from TextIndexCommon import debug
class ResultListNG:
    """ class to keep results for TextIndexNG queries """

    def __init__(self, d, words, index):
        # index is usually an instance of TextIndexNG
        self._index = index

        # words is either an OOSet or a mapping
        if type(words) is not OOSet: words = OOSet(words)
        self._words = words

        self._dict = d
        self.keys = self._dict.keys
        self.values = self._dict.values
        self.items = self._dict.items

    def Intersection(self, d1, d2):
        """ Intersection of the documentIds (keys) of two dictionaries.
            The lists of positions are merged.
        """
        r = {}
        docIds = Iintersection(IISet(d1.keys()), IISet(d2.keys()))
        for docId in docIds:
            r[docId] = Iunion(d1[docId], d2[docId])
        return r

    def Union(self, d1, d2):
        """ Union of the documentIds (keys) of two dictionaries.
            The lists of positions are merged.
        """
        r = d1.copy()
        for docId in d2.keys():
            if d1.has_key(docId):
                r[docId] = Iunion(d1[docId], d2[docId])
            else:
                r[docId] = d2[docId]
        return r
    def __and__(self, x):
        """ and """
        return self.__class__(
            self.Intersection(self._dict, x._dict),
            Ounion(self._words, x._words),
            self._index,
            )

    def and_not(self, x):
        """ and not """
        return self.__class__(
            difference(self._dict, x._dict),
            self._words,
            self._index,
            )

    def __or__(self, x):
        """ or """
        return self.__class__(
            self.Union(self._dict, x._dict),
            Ounion(self._words, x._words),
            self._index,
            )
    def near(self, x):
        """ near search """
        debug('-' * 78)
        debug('entering near:')
        debug(self._words)
        debug(x._words)

        result = IIBucket()
        dict = self._dict
        xdict = x._dict
        positions = self._index.positions

        debug("applying near search for documents:")
        debug("\t", dict)
        debug("\t", xdict)

        # inters maps each documentId to the merged positions of both operands
        inters = self.Intersection(dict, xdict)

        debug("Intersection is:")
        debug('\t', inters)

        for docId in inters.keys():
            debug('searching for positions', docId, self._words)
            p1 = positions(docId, self._words)

            debug('searching for positions', docId, x._words)
            p2 = positions(docId, x._words)

            leftPositions = IISet()
            for set in p1.values():
                leftPositions = Iunion(leftPositions, set)

            rightPositions = IISet()
            for set in p2.values():
                rightPositions = Iunion(rightPositions, set)

            # two words count as "near" when their positions differ by
            # fewer than four words (hardcoded window)
            for pl in leftPositions:
                for pr in rightPositions:
                    diff = abs(pl - pr)
                    if diff < 4:
                        debug('difference for (%d,%d): %d' % (pl, pr, diff))
                        result[docId] = 0

        return self.__class__(
            result, Ounion(self._words, x._words), self._index)
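

# A minimal usage sketch, not part of the original commit: the document ids,
# word positions and words below are invented and only illustrate how two
# ResultListNG objects combine.  '&' and '|' never touch the index, so None
# stands in for it here.
if __name__ == '__main__':
    left = ResultListNG({1: IISet([3]), 2: IISet([7])}, ['quick'], None)
    right = ResultListNG({1: IISet([5])}, ['fox'], None)

    print (left & right).keys()    # documents containing both words  -> [1]
    print (left | right).keys()    # documents containing either word -> [1, 2]

    # left.near(right) would additionally need an index object whose
    # positions(docId, words) method returns word -> IISet of positions.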
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
#############################################################################
import re

from TextOperators import operator_dict, Near, QueryError
from types import ListType

_debug = 1
def parse(s):
    """Parse parentheses and quotes"""
    l = []
    tmp = s.lower()

    p = parens(tmp)
    while p is not None:
        # Look for quotes in the section of the string before
        # the parentheses, then parse the string inside the parens
        l = l + quotes(p[0])
        l.append(parse(p[1]))

        # continue looking through the rest of the string
        tmp = p[2]
        p = parens(tmp)

    return l + quotes(tmp)
def parse2(q, default_operator, operator_dict=operator_dict):
    """Find operators and operands"""
    isop = operator_dict.has_key

    i = 0
    while i < len(q):
        e = q[i]
        if isinstance(e, ListType):
            q[i] = parse2(e, default_operator)
            if i % 2:
                q.insert(i, default_operator)
                i += 1
        elif i % 2:
            # This element should be an operator
            if isop(e):
                # Ensure that it is identical, not merely equal.
                q[i] = operator_dict[e]
            else:
                # Insert the default operator.
                q.insert(i, default_operator)
                i += 1
        i += 1

    return q
def parens(s, parens_re=re.compile('[()]').search):
    mo = parens_re(s)
    if mo is None: return

    open_index = mo.start(0) + 1
    paren_count = 0

    while mo is not None:
        index = mo.start(0)

        if s[index] == '(':
            paren_count = paren_count + 1
        else:
            paren_count = paren_count - 1
            if paren_count == 0:
                return (s[:open_index - 1], s[open_index:index],
                        s[index + 1:])
            if paren_count < 0:
                break

        mo = parens_re(s, index + 1)

    raise QueryError, "Mismatched parentheses"
def quotes(s):
    if '"' not in s: return s.split()

    # split up quoted regions
    splitted = re.split('\s*\"\s*', s)
    if (len(splitted) % 2) == 0: raise QueryError, "Mismatched quotes"

    for i in range(1, len(splitted), 2):
        # split the quoted region into words
        words = splitted[i] = splitted[i].split()

        # put the Proximity operator in between quoted words
        j = len(words) - 1
        while j > 0:
            words.insert(j, Near)
            j = j - 1

    i = len(splitted) - 1
    while i >= 0:
        # split the non-quoted region into words
        splitted[i:i+1] = splitted[i].split()
        i = i - 2

    return filter(None, splitted)
def debug(*args):
    """ used by TextIndexNG for dev. purposes """
    import sys
    if _debug:
        for a in args:
            sys.stdout.write(str(a))
        sys.stdout.write('\n')
        sys.stdout.flush()
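

# A minimal sketch, not part of the original commit: it shows what the two
# parsing stages produce for an invented sample query; passing
# operator_dict['and'] as the default operator is an assumption made for
# illustration only.
if __name__ == '__main__':
    q = parse('"quick fox" and (dog or cat)')
    print q
    # -> [['quick', ..., 'fox'], 'and', ['dog', 'or', 'cat']]
    #    (the '...' between the quoted words is the Near operator)

    print parse2(q, operator_dict['and'])
    # -> [['quick', ..., 'fox'], and, ['dog', or, 'cat']]
    #    (operator tokens have been replaced by the Op singletons)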
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
#############################################################################
class Op:
    """ TextIndex operator class """

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return self.name

    __str__ = __repr__


AndNot = Op('andnot')
And = Op('and')
Or = Op('or')
Near = Op('...')

operator_dict = {
    'andnot': AndNot,
    'and': And,
    'or': Or,
    '...': Near,
    'near': Near,
    AndNot: AndNot,
    And: And,
    Or: Or,
    Near: Near
    }

QueryError = 'TextIndex.QueryError'
#!/usr/bin/env python2.1

from Products.PluginIndexes.TextIndexNG import TextIndexNG
import os, sys, re, traceback, atexit
import readline

histfile = os.path.expanduser('~/.pyhist')
try:
    readline.read_history_file(histfile)
except IOError: pass
atexit.register(readline.write_history_file, histfile)

datadir = '/work/html//doc/python-2.2/lib'
datadir = '/work/html//doc/python-2.2/ext'


class extra: pass


class TO:
    def __init__(self, txt):
        self.text = txt


ex = extra()
ex.useSplitter = 'ZopeSplitter'
ex.useStemmer = 'porter'
ex.useOperator = 'and'
ex.lexicon = None
ex.useGlobbing = 1

TI = TextIndexNG.TextIndexNG('text', ex)

t1 = TO('this text is a suxing text')
t2 = TO('the suxing quick brown fox jumps over the lazy dog because the dog is quick and jumps quick')

TI.index_object(-1, t1)
TI.index_object(-2, t2)

files = os.listdir(datadir)
files.sort()

for i in range(len(files)):
    f = files[i]
    print f
    fname = os.path.join(datadir, f)
    data = open(fname).read()
    T = TO(data)
    TI.index_object(i, T)

#TI.newTree()
#print
#print TI._apply_index({'text':{'query':'suxing'}})
#print
#print TI._apply_index({'text':{'query':'blabla'}})
#print
#print TI._apply_index({'text':{'query':'suxing and quick'}})
##print TI._apply_index({'text':{'query':'(wurm dog and cat) or dummnase'}})
##print TI._apply_index({'text':{'query':'("wurm dog blabla the" and cat) or dummnase'}})
#print
#x = TI._apply_index({'text':{'query':'dog and lazy'}})
#print x

while 1:
    line = raw_input("> ")
    try:
        nums, dummy = TI._apply_index({'text': {'query': line}})
        print "Result:"
        for k, v in nums.items():
            print k, files[k]
        print
    except:
        traceback.print_exc()
@@ -19,8 +19,9 @@ import PathIndex.PathIndex
 import TextIndex.TextIndex
 import FieldIndex.FieldIndex
 import KeywordIndex.KeywordIndex
+import TextIndexNG.TextIndexNG
-_indexes = ['TextIndex','KeywordIndex','FieldIndex','PathIndex']
+_indexes = ['TextIndex','TextIndexNG','KeywordIndex','FieldIndex','PathIndex']
 def initialize(context):