Commit 39eb5d7a authored by chris

indexed data now stored as dictionaries rather than ResultLists.

indexing documents with fewer than two keywords fails silently rather
than raising an exception.
parent 144c0bc0
...@@ -30,7 +30,7 @@ Example usage: ...@@ -30,7 +30,7 @@ Example usage:
print i['blah'] print i['blah']
$Id: InvertedIndex.py,v 1.24 1997/03/24 20:22:27 chris Exp $''' $Id: InvertedIndex.py,v 1.25 1997/03/28 16:53:50 chris Exp $'''
# Copyright # Copyright
# #
# Copyright 1996 Digital Creations, L.C., 910 Princess Anne # Copyright 1996 Digital Creations, L.C., 910 Princess Anne
...@@ -82,6 +82,11 @@ $Id: InvertedIndex.py,v 1.24 1997/03/24 20:22:27 chris Exp $''' ...@@ -82,6 +82,11 @@ $Id: InvertedIndex.py,v 1.24 1997/03/24 20:22:27 chris Exp $'''
# (540) 371-6909 # (540) 371-6909
# #
# $Log: InvertedIndex.py,v $ # $Log: InvertedIndex.py,v $
# Revision 1.25 1997/03/28 16:53:50 chris
# indexed data now stored as dictionaries rather than ResultLists.
# indexing documents with fewer than two keywords fails silently rather
# than raising an exception.
#
# Revision 1.24 1997/03/24 20:22:27 chris # Revision 1.24 1997/03/24 20:22:27 chris
# *** empty log message *** # *** empty log message ***
# #
...@@ -167,7 +172,7 @@ $Id: InvertedIndex.py,v 1.24 1997/03/24 20:22:27 chris Exp $''' ...@@ -167,7 +172,7 @@ $Id: InvertedIndex.py,v 1.24 1997/03/24 20:22:27 chris Exp $'''
# #
# #
# #
__version__='$Revision: 1.24 $'[11:-2] __version__='$Revision: 1.25 $'[11:-2]
import regex, regsub, string, copy import regex, regsub, string, copy
...@@ -293,14 +298,16 @@ class ResultList: ...@@ -293,14 +298,16 @@ class ResultList:
for key,v in self.items(): for key,v in self.items():
try: try:
xv=x[key] xv = x[key]
v=v[0]+xv[0], v[1]+xv[1] v = v[0] + xv[0], v[1] + xv[1]
except: pass except: pass
result[key] = v result[key] = v
for key,v in x.items(): for key,v in x.items():
try: self[key] try:
except: result[key]=v self[key]
except:
result[key]=v
return result return result
...@@ -383,16 +390,18 @@ RegexType = type(regex.compile('')) ...@@ -383,16 +390,18 @@ RegexType = type(regex.compile(''))
IndexingError = 'InvertedIndex.IndexingError' IndexingError = 'InvertedIndex.IndexingError'
_default_stop_words = [ _default_stop_words = [
'also', 'an', 'and', 'are', 'at', 'be', 'been', 'being', 'but', 'by', 'about', 'all', 'also', 'an', 'and', 'any', 'are', 'as', 'at', 'be',
'can', 'cannot', 'did', 'do', 'doing', 'either', 'else', 'even', 'for', 'because', 'been', 'being', 'but', 'by', 'can', 'cannot', 'did', 'do',
'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers', 'herself', 'doing', 'each', 'either', 'else', 'even', 'for', 'from', 'get', 'got',
'him', 'himself', 'his', 'if', 'in', 'it', 'its', 'me', 'my', 'myself', 'had', 'has', 'have', 'he', 'her', 'hers', 'herself', 'him', 'himself',
'no', 'not', 'of', 'on', 'only', 'onto', 'or', 'our', 'ourselves', 'she', 'so', 'some', 'his', 'how', 'if', 'in', 'into', 'is', 'it', 'its', 'me', 'my', 'myself',
'than', 'that', 'the', 'their', 'them', 'themselves', 'then', 'there', 'no', 'not', 'of', 'on', 'one', 'only', 'onto', 'or', 'our', 'ourselves',
'these', 'they', 'this', 'those', 'to', 'too', 'unless', 'until', 'us', 'she', 'since', 'so', 'some', 'take', 'than', 'that', 'the', 'their', 'them',
'very', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through',
'who', 'whoever', 'whom', 'whomever', 'whose', 'why', 'with', 'without', 'to', 'too', 'unless', 'until', 'upon', 'us', 'very', 'was', 'we', 'were',
'would', 'yes', 'your', 'yours', 'yourself', 'yourselves', 'what', 'when', 'where', 'which', 'while', 'who', 'whoever', 'whom',
'whomever', 'whose', 'why', 'will', 'with', 'without', 'would', 'yes',
'you', 'your', 'yours', 'yourself', 'yourselves',
] ]
default_stop_words = {} default_stop_words = {}
...@@ -448,13 +457,8 @@ class Index: ...@@ -448,13 +457,8 @@ class Index:
print i['blah'] print i['blah']
''' '''
list_class = ResultList def __init__(self, index_dictionary = None)
def __init__(self, index_dictionary = None, list_class = None):
'Create an inverted index' 'Create an inverted index'
if (list_class is not None):
self.list_class = list_class
if (index_dictionary is None): if (index_dictionary is None):
index_dictionary = copy.copy(default_stop_words) index_dictionary = copy.copy(default_stop_words)
...@@ -495,7 +499,7 @@ class Index: ...@@ -495,7 +499,7 @@ class Index:
src = map(lower,filter(None, self.split_words(src))) src = map(lower,filter(None, self.split_words(src)))
if (len(src) < 2): if (len(src) < 2):
raise IndexingError, 'cannot index document with fewer than two keywords' return
nwords = math.log(len(src)) nwords = math.log(len(src))
...@@ -525,18 +529,20 @@ class Index: ...@@ -525,18 +529,20 @@ class Index:
except KeyError: except KeyError:
d[s] = [ i ] d[s] = [ i ]
addentry=self.addentry addentry = self.addentry
for word,positions in d.items(): for word, positions in d.items():
freq = int(10000 * (len(positions) / nwords)) freq = int(10000 * (len(positions) / nwords))
addentry(word,srckey,(freq, positions)) addentry(word,srckey,(freq, positions))
def addentry(self,word,key,data): def addentry(self,word,key,data):
index=self._index_object index = self._index_object
try: rl=index[word] try:
rl = index[word]
except: except:
rl=self.list_class() rl = self.list_class()
index[word]=rl index[word] = {}
rl[key]=data
rl[key] = data
def __getitem__(self, key): def __getitem__(self, key):
'''\ '''\
...@@ -550,7 +556,7 @@ class Index: ...@@ -550,7 +556,7 @@ class Index:
''' '''
index = self._index_object index = self._index_object
List = self.list_class List = ResultList
if (type(key) == RegexType): if (type(key) == RegexType):
dict = {} dict = {}
...@@ -585,7 +591,7 @@ class Index: ...@@ -585,7 +591,7 @@ class Index:
if (key is None): if (key is None):
return List() return List()
return key return List(key)
def keys(self): def keys(self):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment