[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - TextIndexNG.py:1.2.2.51
Andreas Jung
andreas@digicool.com
Mon, 11 Mar 2002 18:48:39 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv12730
Modified Files:
Tag: ajung-textindexng-branch
TextIndexNG.py
Log Message:
refactored forward and reverse indexes into a separate module
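(The new WordIdDocumentIdStorage module itself is not part of this checkin. The following is only a minimal sketch, inferred from how the diff below uses it, of the interface it presumably exposes: insert(), removeDocument(), get() and len(), combining the old forward (wordId -> documentIds) and reverse (documentId -> wordIds) BTrees in one object. The committed module may differ.)

    from BTrees.IOBTree import IOBTree
    from BTrees.IIBTree import IISet

    class WordIdDocumentIdStorage:
        """Sketch: forward and reverse word/document index in one place."""

        def __init__(self):
            self._fwd = IOBTree()   # wordId -> IISet(documentIds)
            self._rev = IOBTree()   # documentId -> IISet(wordIds)

        def insert(self, wordIds, documentId):
            """Register documentId under every wordId and remember the
            wordIds for the document (replaces insertForwardEntries and
            insertBackwardEntries)."""
            if isinstance(wordIds, int):
                wordIds = [wordIds]
            for wid in wordIds:
                if wid not in self._fwd:
                    self._fwd[wid] = IISet()
                self._fwd[wid].insert(documentId)
            if documentId in self._rev:
                self._rev[documentId].update(wordIds)
            else:
                self._rev[documentId] = IISet(wordIds)

        def removeDocument(self, documentId):
            """Drop all forward entries for documentId, then the reverse
            entry; a no-op if the document is not indexed."""
            for wid in self._rev.get(documentId, IISet()):
                self._fwd[wid].remove(documentId)
                if len(self._fwd[wid]) == 0:
                    del self._fwd[wid]
            if documentId in self._rev:
                del self._rev[documentId]

        def get(self, wordId):
            """Return the IISet of documentIds for wordId (empty if unknown)."""
            return self._fwd.get(wordId, IISet())

        def __len__(self):
            """Number of distinct wordIds, as used by numObjects()."""
            return len(self._fwd)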
=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextIndexNG.py 1.2.2.50 => 1.2.2.51 ===
from Products.PluginIndexes.common.util import parseIndexRequest
from OFS.content_types import guess_content_type
-from BTrees.IOBTree import IOBTree
-from BTrees.OOBTree import OOBTree
-from BTrees.IIBTree import IIBTree, IIBucket, IISet
+from BTrees.IIBTree import IIBucket, IISet
from BTrees.IIBTree import intersection as IntIntersection, union as IntUnion
from BTrees.OOBTree import intersection as ObjIntersection, union as ObjUnion
from BTrees.IIBTree import weightedIntersection
@@ -44,6 +42,7 @@
from queryparser.queryparser import QueryParser
from DumbQueryParser import DumbQueryParser
+from WordIdDocumentIdStorage import WordIdDocumentIdStorage
import Stemmer, Similarity
import Thesaurus, StopWords, Normalizer
@@ -51,7 +50,6 @@
import TextIndexCommon
import ConverterRegistry
-
class Timer:
def __init__(self, verbose=1):
@@ -210,10 +208,9 @@
def clear(self):
- self.__OBJECTS = IOBTree() # mapping RID->object reference
+ self.WordDocStorage = WordIdDocumentIdStorage()
+ self.SimilarityWordDocStorage = WordIdDocumentIdStorage()
- self._IDX = IOBTree()
- self._invIDX = IOBTree()
self._thesaurus = None
self._stopwords = None
@@ -250,9 +247,7 @@
# Similarity
if self.useSimilarity:
- self._PROX_LEX = SimilarityLexicon(algorithm=self.useSimilarity)
- self._PROX_IDX = IOBTree()
- self._invPROX_IDX = IOBTree()
+ self._SIMILARITY_LEXICON = SimilarityLexicon(algorithm=self.useSimilarity)
# the selection of the Similarity function must be more general
# in the future. This requires some more work on the Python
@@ -302,75 +297,8 @@
return Stemmer.availableStemmers()
- def insertBackwardEntries(self,widLst, documentId):
- """ insert a list of wordIds for the given documentId
- into the backward index
- """
-
- idx = self._invIDX
-
- try:
- idx[documentId].update(widLst)
- except:
- idx[documentId] = IISet(widLst)
-
-
-
- def insertForwardEntries(self,wordIds,documentId):
- """ insert entries for forward index. This function does not store
- word positions. Word positions are calculated when document is in the
- hitlist.
-
- wordId is either an integer or a list of integers
- """
-
- # self._IDX is a mapping:
- # wordId -> documentId
-
- idx = self._IDX
-
- if isinstance(wordIds,IntType): wordIds = [wordIds]
-
- for wordId in wordIds:
-
- try:
- idx[wordId].insert(documentId)
- except:
- idx[wordId] = IISet()
- idx[wordId].insert(documentId)
-
-
- def insertSimilarityEntries(self,wordIds,documentId):
- """ insert forward *and* backword entries for Similarity indexes """
-
- idx = self._PROX_IDX
- invidx = self._invPROX_IDX
-
- if isinstance(wordIds,IntType): wordIds = [wordIds]
-
- for wordId in wordIds:
-
- if idx.has_key(wordId) == 0:
- idx[wordId] = IISet()
-
- idx[wordId].insert(documentId)
-
-
- if invidx.has_key(documentId)==0:
- invidx[documentId] = IISet(wordIds)
- else:
- invidx[documentId].update(wordIds)
-
-
-
def _printIndex(self):
-
- for wordId in self._IDX.keys():
- print '-'*78
- print wordId,self._v_getWordById(wordId),
- print
- for k,v in self._IDX[wordId].items():
- print k,v
+ print self.WordDocStorage
def index_object(self, documentId, obj, threshold=None):
@@ -437,10 +365,10 @@
if self.useSimilarity:
Similarity_words = self._v_Similarityfunc(words)
- Similarity_widList = self._PROX_LEX.getWordIdList(Similarity_words)
+ Similarity_widList = self._SIMILARITY_LEXICON.getWordIdList(Similarity_words)
assert len(Similarity_words)==len(Similarity_widList)
- self.insertSimilarityEntries(Similarity_widList,documentId)
+ self.SimilarityWordDocStorage.insert(Similarity_widList, documentId)
T("Similarity")
@@ -470,12 +398,8 @@
T("Widlist")
# insert forward entries
- self.insertForwardEntries(widLst,documentId)
- T("ForwardEntries")
-
- # insert backward entries
- self.insertBackwardEntries(widLst,documentId)
- T("BackwardEntries")
+ self.WordDocStorage.insert(widLst, documentId)
+ T("WordDocEntries")
if self.verbose: T.printStats()
@@ -486,18 +410,9 @@
""" carefully unindex document with Id 'documentId'
index and do not fail if it does not exist
"""
+
+ self.WordDocStorage.removeDocument(documentId)
- invIDX = self._invIDX
- IDX = self._IDX
-
- for wid in invIDX[documentId]:
- IDX[wid].remove( documentId )
-
- if len(IDX[wid])==0:
- del IDX[wid]
-
- del invIDX[documentID]
-
def getLexicon(self):
return self._LEXICON
@@ -600,7 +515,7 @@
docIds = IISet()
for wid in wids:
- docIds.update( self._IDX.get(wid) )
+ docIds.update( self.WordDocStorage.get(wid) )
debug('\tDocIds: ', list(docIds.keys()))
@@ -620,7 +535,7 @@
raise TextIndexNGException, 'Similarity search is not enabled'
# Lookup list of wordIds (usually should contain only *one*)
- wids = self._PROX_LEX.get(word)
+ wids = self._SIMILARITY_LEXICON.get(word)
debug("\tWids: ", wids)
# Retrieve list of docIds for that wordId
@@ -628,10 +543,12 @@
# docIds is an IOBTree and contains the mapping
# (documentId, list of positions) for one word/wid
- docIds = self._PROX_IDX.get(wids[0])
- debug('\tDocIds: ', list(docIds.keys()))
- debug('\tPositions: ', list(docIds.values()))
+ docIds = IISet()
+ for wid in wids:
+ docIds.update( self.SimilarityWordDocStorage.get(wid) )
+
+ debug('\tDocIds: ', docIds)
r = ResultSet( docIds, (word,))
@@ -821,7 +738,7 @@
def numObjects(self):
""" return number of index objects """
- return len(self._IDX)
+ return len(self.WordDocStorage)
def info(self):