Commit 43fd91db authored by Andreas Jung

- added a prototype for a new TextIndex (TextIndexNG) with support for NEAR
  search and stemming for eleven languages
- TextIndex code cleanup
- refactoring of the existing TextIndex code
parent 22ad134e
#############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from BTrees.IIBTree import IIBucket,IISet
from BTrees.OOBTree import OOSet
from BTrees.IIBTree import weightedIntersection, difference
from BTrees.IIBTree import union as Iunion
from BTrees.IIBTree import intersection as Iintersection
from BTrees.OOBTree import union as Ounion
from types import TupleType
from TextIndexCommon import debug
class ResultListNG:
    """ class to keep results for TextIndexNG queries """

    def __init__(self, d, words, index):
        # index is usually an instance of TextIndexNG
        self._index = index

        # words is either an OOSet or a mapping
        if type(words) is not OOSet: words = OOSet(words)
        self._words = words

        self._dict = d
        self.keys = self._dict.keys
        self.values = self._dict.values
        self.items = self._dict.items

    def Intersection(self, d1, d2):
        """ Intersection of the documentIds (keys) of two dictionaries.
            The lists of positions are merged.
        """
        r = {}
        docIds = Iintersection(IISet(d1.keys()), IISet(d2.keys()))
        for docId in docIds:
            r[docId] = Iunion(d1[docId], d2[docId])
        return r

    def Union(self, d1, d2):
        """ Union of the documentIds (keys) of two dictionaries.
            The lists of positions are merged.
        """
        r = d1.copy()
        for docId in d2.keys():
            if d1.has_key(docId):
                r[docId] = Iunion(d1[docId], d2[docId])
            else:
                r[docId] = d2[docId]
        return r
    def __and__(self, x):
        """ and """
        return self.__class__(
            self.Intersection(self._dict, x._dict),
            Ounion(self._words, x._words),
            self._index,
            )

    def and_not(self, x):
        """ and not """
        return self.__class__(
            difference(self._dict, x._dict),
            self._words,
            self._index,
            )

    def __or__(self, x):
        """ or """
        return self.__class__(
            self.Union(self._dict, x._dict),
            Ounion(self._words, x._words),
            self._index,
            )
    def near(self, x):
        """ near search """
        debug('-' * 78)
        debug('entering near:')
        debug(self._words)
        debug(x._words)

        result = IIBucket()
        dict = self._dict
        xdict = x._dict
        positions = self._index.positions

        debug("applying near search for documents:")
        debug("\t", dict)
        debug("\t", xdict)

        # inters maps each documentId to the merged positions of both operands
        inters = self.Intersection(dict, xdict)

        debug("Intersection is:")
        debug('\t', inters)

        for docId in inters.keys():
            debug('searching for positions', docId, self._words)
            p1 = positions(docId, self._words)

            debug('searching for positions', docId, x._words)
            p2 = positions(docId, x._words)

            leftPositions = IISet()
            for set in p1.values():
                leftPositions = Iunion(leftPositions, set)

            rightPositions = IISet()
            for set in p2.values():
                rightPositions = Iunion(rightPositions, set)

            # two words count as "near" when their positions differ by
            # fewer than four words (hardcoded window)
            for pl in leftPositions:
                for pr in rightPositions:
                    diff = abs(pl - pr)
                    if diff < 4:
                        debug('difference for (%d,%d): %d' % (pl, pr, diff))
                        result[docId] = 0

        return self.__class__(
            result, Ounion(self._words, x._words), self._index)
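

# A minimal usage sketch, not part of the original commit: the document ids,
# word positions and words below are invented and only illustrate how two
# ResultListNG objects combine.  '&' and '|' never touch the index, so None
# stands in for it here.
if __name__ == '__main__':
    left = ResultListNG({1: IISet([3]), 2: IISet([7])}, ['quick'], None)
    right = ResultListNG({1: IISet([5])}, ['fox'], None)

    print (left & right).keys()    # documents containing both words  -> [1]
    print (left | right).keys()    # documents containing either word -> [1, 2]

    # left.near(right) would additionally need an index object whose
    # positions(docId, words) method returns word -> IISet of positions.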
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
#############################################################################
import re

from TextOperators import operator_dict, Near, QueryError
from types import ListType

_debug = 1
def parse(s):
    """Parse parentheses and quotes"""
    l = []
    tmp = s.lower()

    p = parens(tmp)
    while p is not None:
        # Look for quotes in the section of the string before
        # the parentheses, then parse the string inside the parens
        l = l + quotes(p[0])
        l.append(parse(p[1]))

        # continue looking through the rest of the string
        tmp = p[2]
        p = parens(tmp)

    return l + quotes(tmp)
def parse2(q, default_operator, operator_dict=operator_dict):
    """Find operators and operands"""
    isop = operator_dict.has_key

    i = 0
    while i < len(q):
        e = q[i]
        if isinstance(e, ListType):
            q[i] = parse2(e, default_operator)
            if i % 2:
                q.insert(i, default_operator)
                i += 1
        elif i % 2:
            # This element should be an operator
            if isop(e):
                # Ensure that it is identical, not merely equal.
                q[i] = operator_dict[e]
            else:
                # Insert the default operator.
                q.insert(i, default_operator)
                i += 1
        i += 1

    return q
def parens(s, parens_re=re.compile('[()]').search):
    mo = parens_re(s)
    if mo is None: return

    open_index = mo.start(0) + 1
    paren_count = 0

    while mo is not None:
        index = mo.start(0)

        if s[index] == '(':
            paren_count = paren_count + 1
        else:
            paren_count = paren_count - 1
            if paren_count == 0:
                return (s[:open_index - 1], s[open_index:index],
                        s[index + 1:])
            if paren_count < 0:
                break

        mo = parens_re(s, index + 1)

    raise QueryError, "Mismatched parentheses"
def quotes(s):
    if '"' not in s: return s.split()

    # split up quoted regions
    splitted = re.split('\s*\"\s*', s)
    if (len(splitted) % 2) == 0: raise QueryError, "Mismatched quotes"

    for i in range(1, len(splitted), 2):
        # split the quoted region into words
        words = splitted[i] = splitted[i].split()

        # put the Proximity operator in between quoted words
        j = len(words) - 1
        while j > 0:
            words.insert(j, Near)
            j = j - 1

    i = len(splitted) - 1
    while i >= 0:
        # split the non-quoted region into words
        splitted[i:i+1] = splitted[i].split()
        i = i - 2

    return filter(None, splitted)
def debug(*args):
    """ used by TextIndexNG for dev. purposes """
    import sys
    if _debug:
        for a in args:
            sys.stdout.write(str(a))
        sys.stdout.write('\n')
        sys.stdout.flush()
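

# A minimal sketch, not part of the original commit: it shows what the two
# parsing stages produce for an invented sample query; passing
# operator_dict['and'] as the default operator is an assumption made for
# illustration only.
if __name__ == '__main__':
    q = parse('"quick fox" and (dog or cat)')
    print q
    # -> [['quick', ..., 'fox'], 'and', ['dog', 'or', 'cat']]
    #    (the '...' between the quoted words is the Near operator)

    print parse2(q, operator_dict['and'])
    # -> [['quick', ..., 'fox'], and, ['dog', or, 'cat']]
    #    (operator tokens have been replaced by the Op singletons)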
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
#############################################################################
class Op:
    """ TextIndex operator class """

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return self.name

    __str__ = __repr__


AndNot = Op('andnot')
And = Op('and')
Or = Op('or')
Near = Op('...')

operator_dict = {
    'andnot': AndNot,
    'and': And,
    'or': Or,
    '...': Near,
    'near': Near,
    AndNot: AndNot,
    And: And,
    Or: Or,
    Near: Near
    }

QueryError = 'TextIndex.QueryError'
#!/usr/bin/env python2.1

from Products.PluginIndexes.TextIndexNG import TextIndexNG
import os, sys, re, traceback, atexit
import readline

histfile = os.path.expanduser('~/.pyhist')
try:
    readline.read_history_file(histfile)
except IOError: pass
atexit.register(readline.write_history_file, histfile)

datadir = '/work/html//doc/python-2.2/lib'
datadir = '/work/html//doc/python-2.2/ext'


class extra: pass


class TO:
    def __init__(self, txt):
        self.text = txt


ex = extra()
ex.useSplitter = 'ZopeSplitter'
ex.useStemmer = 'porter'
ex.useOperator = 'and'
ex.lexicon = None
ex.useGlobbing = 1

TI = TextIndexNG.TextIndexNG('text', ex)

t1 = TO('this text is a suxing text')
t2 = TO('the suxing quick brown fox jumps over the lazy dog because the dog is quick and jumps quick')

TI.index_object(-1, t1)
TI.index_object(-2, t2)

files = os.listdir(datadir)
files.sort()

for i in range(len(files)):
    f = files[i]
    print f
    fname = os.path.join(datadir, f)
    data = open(fname).read()
    T = TO(data)
    TI.index_object(i, T)

#TI.newTree()
#print
#print TI._apply_index({'text':{'query':'suxing'}})
#print
#print TI._apply_index({'text':{'query':'blabla'}})
#print
#print TI._apply_index({'text':{'query':'suxing and quick'}})
##print TI._apply_index({'text':{'query':'(wurm dog and cat) or dummnase'}})
##print TI._apply_index({'text':{'query':'("wurm dog blabla the" and cat) or dummnase'}})
#print
#x = TI._apply_index({'text':{'query':'dog and lazy'}})
#print x

while 1:
    line = raw_input("> ")
    try:
        nums, dummy = TI._apply_index({'text': {'query': line}})
        print "Result:"
        for k, v in nums.items():
            print k, files[k]
        print
    except:
        traceback.print_exc()
@@ -19,8 +19,9 @@ import PathIndex.PathIndex
 import TextIndex.TextIndex
 import FieldIndex.FieldIndex
 import KeywordIndex.KeywordIndex
+import TextIndexNG.TextIndexNG
-_indexes = ['TextIndex','KeywordIndex','FieldIndex','PathIndex']
+_indexes = ['TextIndex','TextIndexNG','KeywordIndex','FieldIndex','PathIndex']
 def initialize(context):