Commit 3d88c027, authored by Evan Simpson

Merge TextIndex fixes from 2.4 branch

parent 233671d4
......@@ -85,7 +85,7 @@
from Lexicon import Lexicon
import Splitter
from Products.PluginIndexes.TextIndex.TextIndex import Or
from TextIndex import Or, Op
import re, string
......@@ -147,14 +147,12 @@ class GlobbingLexicon(Lexicon):
def createDigrams(self, word):
    """Return the list of digrams for *word*, with end markers.

    Each character is paired with its predecessor; ``self.eow`` (the
    end-of-word marker) is used as the predecessor of the first
    character and is appended as the final character, so 'cat' with
    eow='$' yields ['$c', 'ca', 'at', 't$'].

    Unlike the older index-slicing version, this also handles the
    empty string without raising (it returns a single eow+eow digram).
    """
    # Start from the characters themselves, with the end marker last.
    digrams = list(word)
    digrams.append(self.eow)
    # Walk the list pairing each element with the previous one; the
    # beginning-of-word is marked by seeding `last` with eow.
    last = self.eow
    for i in range(len(digrams)):
        last, digrams[i] = digrams[i], last + digrams[i]
    return digrams
......@@ -269,21 +267,30 @@ class GlobbingLexicon(Lexicon):
def query_hook(self, q):
    """Expand wildcard terms in the parsed query, in place.

    Walks the (possibly nested) query list backwards.  Sub-lists are
    processed recursively; operator tokens (Op instances) are left
    alone; any string containing a wildcard character is replaced by
    a sub-list of the matching word ids joined by Or.

    Returns the mutated query list.
    """
    ListType = type([])
    # Iterate backwards so in-place replacement is safe.
    i = len(q) - 1
    while i >= 0:
        e = q[i]
        if isinstance(e, ListType):
            # Nested (parenthesized) sub-query: expand recursively.
            self.query_hook(e)
        elif isinstance(e, Op):
            # Operators pass through untouched.
            pass
        elif ((self.multi_wc in e) or
              (self.single_wc in e)):
            # Wildcard term: replace it with [wid, Or, wid, ...].
            wids = self.get(e)
            words = []
            for wid in wids:
                if words:
                    words.append(Or)
                words.append(wid)
            if not words:
                # if words is empty, insert something that will make
                # textindex's __getitem__ return an empty result list
                words.append('')
            q[i] = words
        i = i - 1
    return q
def Splitter(self, astring, words=None):
""" wrap the splitter """
......@@ -300,18 +307,16 @@ class GlobbingLexicon(Lexicon):
There is no way to quote meta-characters.
"""
# Remove characters that are meaningful in a regex
transTable = string.maketrans("", "")
result = string.translate(pat, transTable,
r'()&|!@#$%^{}\<>.')
# First, deal with mutli-character globbing
result = string.replace(pat, '*', '.*')
# First, deal with multi-character globbing
result = string.replace(result, '*', '.*')
# Next, we need to deal with single-character globbing
result = string.replace(result, '?', '.?')
# Now, we need to remove all of the characters that
# are forbidden.
result = string.translate(result, transTable,
r'()&|!@#$%^{}\<>')
result = string.replace(result, '?', '.')
return "%s$" % result
......
......@@ -267,21 +267,28 @@ class GlobbingLexicon(Lexicon):
def query_hook(self, q):
    """Expand wildcard terms in the parsed query, in place.

    2.4-branch variant: walks the (possibly nested) query list
    backwards, recursing into sub-lists and replacing any string
    containing a wildcard character with a sub-list of the matching
    word ids joined by Or.  Returns the mutated query list.
    """
    ListType = type([])
    # Iterate backwards so in-place replacement is safe.
    i = len(q) - 1
    while i >= 0:
        e = q[i]
        if isinstance(e, ListType):
            # Nested (parenthesized) sub-query: expand recursively.
            self.query_hook(e)
        elif ((self.multi_wc in e) or
              (self.single_wc in e)):
            # Wildcard term: replace it with [wid, Or, wid, ...].
            wids = self.get(e)
            words = []
            for wid in wids:
                if words:
                    words.append(Or)
                words.append(wid)
            if not words:
                # if words is empty, insert something that will make
                # textindex's __getitem__ return an empty result list
                words.append('')
            q[i] = words
        i = i - 1
    return q
def Splitter(self, astring, words=None):
""" wrap the splitter """
......@@ -298,19 +305,16 @@ class GlobbingLexicon(Lexicon):
There is no way to quote meta-characters.
"""
# Remove characters that are meaningful in a regex
transTable = string.maketrans("", "")
result = string.translate(pat, transTable,
r'()&|!@#$%^{}\<>.')
# First, deal with mutli-character globbing
result = string.replace(pat, '*', '.*')
# First, deal with multi-character globbing
result = string.replace(result, '*', '.*')
# Next, we need to deal with single-character globbing
result = string.replace(result, '?', '.?')
# Now, we need to remove all of the characters that
# are forbidden.
result = string.translate(result, transTable,
r'()&|!@#$%^{}\<>')
result = string.replace(result, '?', '.')
return "%s$" % result
......@@ -91,7 +91,7 @@ undo information so that objects can be unindexed when the old value
is no longer known.
"""
__version__ = '$Revision: 1.49 $'[11:-2]
__version__ = '$Revision: 1.50 $'[11:-2]
import string, re
......@@ -428,7 +428,7 @@ class UnTextIndex(Persistent, Implicit):
and a String. Strings are looked up in the lexicon, whereas
Integers are assumed to be resolved word ids. """
if type(word) is IntType:
if isinstance(word, IntType):
# We have a word ID
result = self._index.get(word, {})
return ResultList(result, (word,), self)
......@@ -440,7 +440,7 @@ class UnTextIndex(Persistent, Implicit):
if len(splitSource) == 1:
splitSource = splitSource[0]
if splitSource[:1] == '"' and splitSource[-1:] == '"':
if splitSource[:1] == splitSource[-1:] == '"':
return self[splitSource]
wids=self.getLexicon(self._lexicon).get(splitSource)
......@@ -551,28 +551,37 @@ class UnTextIndex(Persistent, Implicit):
def query(self, s, default_operator=Or):
    """Evaluate a query string.

    Convert the query string into a data structure of nested lists
    and strings, based on the grouping of whitespace-separated
    strings by parentheses and quotes.  The 'Near' operator is
    inserted between the strings of a quoted group.

    The Lexicon is given the opportunity to transform the data
    structure.  Stemming, wildcards, and translation are possible
    Lexicon services.

    Finally, the query list is normalized so that it and every
    sub-list consist of non-operator strings or lists separated by
    operators.  This list is evaluated.
    """
    # First replace any occurrences of " and not " with " andnot "
    # (case-insensitively), so it parses as a single operator token.
    s = re.sub('(?i)\s+and\s*not\s+', ' andnot ', s)

    # Parse parentheses and quotes.
    q = parse(s)

    # Allow the Lexicon to process the query (e.g. expand wildcards).
    q = self.getLexicon(self._lexicon).query_hook(q)

    # Insert the default operator between any two search terms not
    # already joined by an operator.
    q = parse2(q, default_operator)

    # Evaluate the final 'expression'.
    return self.evaluate(q)
......@@ -605,19 +614,17 @@ class UnTextIndex(Persistent, Implicit):
def evaluate(self, query):
"""Evaluate a parsed query"""
# There are two options if the query passed in is only one
# item. It means either it's an embedded query, in which case
# we'll recursively evaluate, other wise it's nothing for us
# to evaluate, and we just get the results and return them.
if (len(query) == 1):
if (type(query[0]) is ListType):
return self.evaluate(query[0])
# Strip off meaningless layers
while isinstance(query, ListType) and len(query) == 1:
query = query[0]
return self[query[0]] # __getitem__
# If it's not a list, assume a string or number
if not isinstance(query, ListType):
return self[query]
# Now we need to loop through the query and expand out
# Now we need to loop through the query and reduce
# operators. They are currently evaluated in the following
# order: AndNote -> And -> Or -> Near
# order: AndNot -> And -> Or -> Near
i = 0
while (i < len(query)):
if query[i] is AndNot:
......@@ -660,98 +667,91 @@ def parse(s):
def parse(s):
    """Parse parentheses and quotes in query string *s*.

    Returns a list of words and operator tokens; each parenthesized
    group becomes a nested list.  The input is lower-cased once up
    front.  Raises QueryError (via parens) on mismatched parentheses.
    """
    l = []
    tmp = s.lower()
    # parens() returns (before, inside, after) for the first balanced
    # group, or None when no parentheses remain.
    p = parens(tmp)
    while p is not None:
        # Look for quotes in the section of the string before
        # the parentheses, then parse the string inside the parens.
        l = l + quotes(p[0])
        l.append(parse(p[1]))
        # Continue looking through the rest of the string.
        tmp = p[2]
        p = parens(tmp)
    return l + quotes(tmp)
def parse2(q, default_operator,
           operator_dict={AndNot: AndNot, And: And, Or: Or, Near: Near}):
    """Normalize the parsed query: alternate operands and operators.

    Walks *q* backwards so insertions don't disturb unvisited
    positions.  Sub-lists are normalized recursively.  Every
    odd-indexed element must be an operator; where it is not, the
    default operator is spliced in.
    """
    isop = operator_dict.has_key
    i = len(q) - 1
    while i >= 0:
        e = q[i]
        if isinstance(e, ListType):
            # Parenthesized sub-query: normalize recursively.
            q[i] = parse2(e, default_operator)
            if i % 2:
                # A sub-list can never serve as an operator.
                q.insert(i, default_operator)
        elif i % 2:
            # This element should be an operator.
            if isop(e):
                # Ensure that it is identical, not merely equal.
                q[i] = operator_dict[e]
            else:
                # Insert the default operator.
                q.insert(i, default_operator)
        i = i - 1
    return q
def parens(s, parens_re=re.compile('[()]').search):
    """Find the first balanced parenthesized group in *s*.

    Returns a 3-tuple (text-before, text-inside, text-after) for the
    first balanced group, or None when *s* contains no parentheses.
    Raises QueryError on mismatched parentheses.
    """
    mo = parens_re(s)
    if mo is None:
        # No parentheses at all.
        return
    # Content of the group starts just after the first paren found.
    open_index = mo.start(0) + 1
    paren_count = 0
    while mo is not None:
        index = mo.start(0)
        if s[index] == '(':
            paren_count = paren_count + 1
        else:
            paren_count = paren_count - 1
            if paren_count == 0:
                # The group opened first has just been closed.
                return (s[:open_index - 1], s[open_index:index],
                        s[index + 1:])
            if paren_count < 0:
                # A close paren with no matching open.
                break
        mo = parens_re(s, index + 1)
    raise QueryError("Mismatched parentheses")
def quotes(s):
    """Split *s* into words, honoring double-quoted phrases.

    Without quotes this is a plain whitespace split.  With quotes,
    the words inside each quoted region are joined by the Near
    (proximity) operator.  Raises QueryError on an odd number of
    quote characters.  Empty fragments are dropped from the result.
    """
    if '"' not in s:
        # Fast path: no quotes, just split the string into words.
        return s.split()

    # Split up quoted regions; odd indexes of `splitted` are the
    # quoted parts, even indexes the unquoted parts.
    splitted = re.split('\s*\"\s*', s)
    if (len(splitted) % 2) == 0:
        raise QueryError("Mismatched quotes")

    for i in range(1, len(splitted), 2):
        # Split the quoted region into words.
        words = splitted[i] = splitted[i].split()
        # Put the Proximity operator in between quoted words.
        j = len(words) - 1
        while j > 0:
            words.insert(j, Near)
            j = j - 1

    i = len(splitted) - 1
    while i >= 0:
        # Split the non-quoted region into words (splicing the words
        # in place of the region, backwards so indexes stay valid).
        splitted[i:i+1] = splitted[i].split()
        i = i - 2
    return [w for w in splitted if w]
......@@ -217,8 +217,8 @@ class Tests(unittest.TestCase):
"""This license has been certified as Open Source(tm).""",
"""I hope I get to work on time""",
]
def globTest(self, qmap, rlist):
    "Check a glob query"
    # Build a fresh index over the sample texts using the
    # GlobbingLexicon, run query map `qmap`, and assert the result
    # document ids equal `rlist`.  Returns the bound _apply_index so
    # callers can issue follow-up queries against the same index.
    index = self.dbopen()
    index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
    for i in range(len(self.sample_texts)):
        self.doc.text = self.sample_texts[i]
        index.index_object(i, self.doc)
    get_transaction().commit()
    self.dbclose()

    index = self.dbopen()
    r = list(index._apply_index(qmap)[0].keys())
    assert r == rlist, r
    return index._apply_index
def checkStarQuery(self):
    "Check a star query"
    self.globTest({'text':'m*n'}, [0,2])
def checkAndQuery(self):
    "Check an AND query"
    self.globTest({'text':'time and country'}, [0,])
def checkOrQuery(self):
    "Check an OR query"
    self.globTest({'text':'time or country'}, [0,1,6])
def checkDefOrQuery(self):
    "Check a default OR query"
    self.globTest({'text':'time country'}, [0,1,6])
def checkNearQuery(self):
    """Check a NEAR query.. (NOTE:ACTUALLY AN 'AND' TEST!!)"""
    # NEAR never worked, so Zopes post-2.3.1b3 define near to mean AND
    self.globTest({'text':'time ... country'}, [0,])
def checkQuotesQuery(self):
    """Check a quoted query"""
    # A matching quoted phrase hits document 0; a phrase whose words
    # exist but not in sequence must return nothing.
    ai = self.globTest({'text':'"This is the time"'}, [0,])

    r = list(ai({'text':'"now is the time"'})[0].keys())
    assert r == [], r
def checkAndNotQuery(self):
    "Check an ANDNOT query"
    self.globTest({'text':'time and not country'}, [6,])
def checkParenMatchingQuery(self):
    "Check a query with parens"
    ai = self.globTest({'text':'(time and country) men'}, [0,])

    r = list(ai({'text':'(time and not country) or men'})[0].keys())
    assert r == [0, 6], r
def checkQuoteMatchingQuery(self):
    # NOTE(review): superseded by checkQuotesQuery, which covers the
    # same cases via globTest; kept here formatted for reference.
    "Check a query with quotes.. this is known to fail under 2.3.1b3-"
    index = self.dbopen()
    index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
    for i in range(len(self.sample_texts)):
        self.doc.text = self.sample_texts[i]
        index.index_object(i, self.doc)
    get_transaction().commit()
    self.dbclose()

    index = self.dbopen()
    r = list(index._apply_index({'text':'"This is the time"'})[0].keys())
    assert r == [0], r

    r = list(index._apply_index({'text':'"now is the time"'})[0].keys())
    assert r == [], r
def checkTextIndexOperatorQuery(self):
    "Check a query with 'textindex_operator' in the request"
    self.globTest({'text':'time men', 'textindex_operator':'and'}, [0,])
def checkNonExistentWord(self):
    """ Check for nonexistent word """
    self.globTest({'text':'zop'}, [])
def checkComplexQuery1(self):
    """ Check complex query 1 """
    self.globTest({'text':'((?ount* or get) and not wait) '
                          '"been *ert*"'}, [0, 1, 5, 6])
def test_suite():
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment