[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - TextIndexNG.py:1.2.2.6
Andreas Jung
andreas@zope.com
Mon, 7 Jan 2002 14:18:20 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv18973
Modified Files:
Tag: ajung-textindexng-branch
TextIndexNG.py
Log Message:
+ works not with ZCatalog
+ support for either internal storage of word positions or on-the-fly lookup
of word positions when a document is in the result list
=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextIndexNG.py 1.2.2.5 => 1.2.2.6 ===
_valid_default_operators = ('and','or')
+
def __init__(self
, id
, extra= None
@@ -68,17 +69,42 @@
self.id = id
+ # reference to catalog (can we get rid of that ?)
+ self.catalog = caller
+
+ # name of splitter
self.useSplitter = getattr(extra,'useSplitter', 'ZopeSplitter')
+
+ # name of stemmer or None
self.useStemmer = getattr(extra,'useStemmer', None)
+
+ # default operator to combine queries
self.useOperator = getattr(extra,'useOperator', 'and')
+
+ # support globbing: 1/0
self.useGlobbing = getattr(extra,'useGlobbing', 1)
+
+ # lexicon to be used (name, reference or None(internal))
self.lexicon = getattr(extra,'lexicon', None)
+
+ # support near search: 1/0 (requires more storage)
self.useNearSearch = getattr(extra,'useNearSearch', 1)
+
+ # default maximum distance for words with near search
self.nearDistance = getattr(extra,'nearDistance', 5)
+
+ # use proximity algorithm
self.useProximity = getattr(extra,'useProximity', None)
- if self.lexicon == 'None': self.lexicon = None
+ # storage of positions for near search ('internal','documentLookup')
+ self.nearStorage = getattr(extra,'nearStorage', 'internal')
+
+ if self.lexicon == 'None': self.lexicon = None
if self.useStemmer == 'None': self.useStemmer = None
+
+ if not self.nearStorage in ('internal','documentLookup'):
+ raise ValueError,'nearStorage must be either "internal"'\
+ ' or "documentLookup"'
self.clear()
@@ -88,6 +114,15 @@
self._IDX = IOBTree()
self._invIDX = IOBTree()
+ # near Search
+ if self.nearStorage == 'internal':
+ self.positions = self.positionsFromInternalStorage
+ self.insertForwardEntry = self.insertForwardEntryInternal
+ else:
+ self.positions = self.positionsFromDocumentLookup
+ self.insertForwardEntry = self.insertForwardEntryDocumentLookup
+
+
# get splitter function
self._splitterfunc = self._stemmerfunc = None
@@ -147,8 +182,10 @@
idx[documentId].update(widLst)
- def insertForwardEntry(self,wordId,pos,documentId):
- """ insert entries for forward index """
+ def insertForwardEntryInternal(self,wordId,pos,documentId):
+ """ insert entries for forward index. This function stores
+ the word positions internally.
+ """
# self._IDX is a mapping:
# wordId -> documentId -> [positions]
@@ -165,6 +202,23 @@
tree[documentId].insert(pos)
+ def insertForwardEntryDocumentLookup(self,wordId,pos,documentId):
+ """ insert entries for forward index. This function does not store
+ word positions. Word positions are calculated when document is in the
+ hitlist.
+ """
+
+ # self._IDX is a mapping:
+ # wordId -> documentId -> [positions]
+
+ idx = self._IDX
+
+ if idx.has_key(wordId) == 0:
+ idx[wordId] = IISet()
+
+ idx[wordId].insert(documentId)
+
+
def _printIndex(self):
for wordId in self._IDX.keys():
@@ -223,7 +277,9 @@
pos+=1
- self.insertBackwardEntries(self,widLst,documentId)
+ self.insertBackwardEntries(widLst,documentId)
+
+ return len(widLst)
def unindex_object(self, documentId):
@@ -281,7 +337,7 @@
else:
r={}
-
+
return ResultListNG(r, (word,), self)
@@ -323,7 +379,7 @@
if not key:
continue
- b = self.query(key, query_operator)
+ b = self.query(key, query_operator).keys()
w, r = weightedIntersection(r, b)
if r is not None:
@@ -332,7 +388,41 @@
return (IIBucket(), (self.id,))
- def positions(self,docId, words):
+ def positionsByDocumentLookup(self,docId, words):
+ """ search all positions for a list of words for
+ a given document given by its documentId.
+ positions() returns a mapping word to
+ list of positions of the word inside the document.
+ """
+
+ debug('searching positions docid: %s, words: %s' % (docId,words))
+
+ res = OOBTree()
+
+ # obtain object from ZCatalog
+ # this implementation must be changed for ZCatalog usage
+ # (for testing purposes we read from the filesystem)
+
+ uid = self.catalog.paths[docId]
+ data = open(uid).read() # CHANGE THAT !!!
+
+ # Split retrieved document and obtain list of word positions
+
+ SP = self._splitterfunc(data)
+
+ for word in words:
+ posLst = SP.indexes(word)
+
+ res[word] = IISet(posLst)
+
+ for k,v in res.items():
+ debug(k,v)
+
+ return res
+
+
+
+ def positionsFromInternalStorage(self,docId, words):
""" search all positions for a list of words for
a given document given by its documentId.
positions() returns a mapping word to