[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - TextIndexNG.py:1.2.2.27

Andreas Jung andreas@digicool.com
Tue, 12 Feb 2002 14:29:30 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv27494

Modified Files:
      Tag: ajung-textindexng-branch
	TextIndexNG.py 
Log Message:
- additional checks for parameters
- minor API cleanup
- prelim. support for NEAR search


=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextIndexNG.py 1.2.2.26 => 1.2.2.27 ===
 
 
-class QueryException(Exception): pass
+class TextIndexNGException(Exception): pass
 
 
 class TextIndexNG(PluggableIndex.PluggableIndex, Persistent,
@@ -91,25 +91,36 @@
          'help': ('TextIndex','TextIndex_Settings.stx')},
     )
 
+    _all_options = ('useSplitter','splitterMaxLen','splitterIndexNumbers',
+         'splitterSingleChars','splitterCasefolding','useStemmer','useOperator',
+         'useGlobbing','lexicon','nearDistance','useProximity','nearSearch',
+         'stopWords','thesaurus','characterMapping'
+        )
+
     query_options = ("query","operator")
     _valid_default_operators    =  ('and','or')
  
 
     def __init__(self 
                  , id 
-                 , extra= None
-                 , caller = None
-                 , verbose = 0
-                 , timed_statistics = 0
+                 , extra= None                # record with all parameters
+                 , caller = None              # ZCatalog instance (old crap)
+                 , verbose = 0                # verbose mode 
+                 , timed_statistics = 0       # timed statistics 
                 ):
 
-        self.id            = id
-
         debug(extra)
 
+        # check parameters
+        for k in dir(extra):
+            if not k in self._all_options:
+                raise TextIndexNGException,'unknown parameter "%s"' % k
 
         # reference to catalog (can we get rid of that ?)
         self.catalog       = caller
+        
+        # our own Id
+        self.id            = id
 
         # name of splitter        
         self.useSplitter   = getattr(extra,'useSplitter',   'ZopeSplitter')
@@ -138,9 +149,6 @@
         # lexicon to be used (name, reference or None(internal))
         self.lexicon       = getattr(extra,'lexicon',       None) or None
 
-        # default maximum distance for words with near search
-        self.nearDistance  = getattr(extra,'nearDistance',  5)
-
         # use proximity algorithm
         self.useProximity  = getattr(extra,'useProximity',  None) or None
 
@@ -148,34 +156,40 @@
         self.nearSearch   = getattr(extra,'nearSearch',  None)
         if self.nearSearch=='None': self.nearSearch = None
 
+        # default maximum distance for words with near search
+        self.nearDistance  = getattr(extra,'nearDistance',  5)
+
         # Stopwords: either filename or StopWord object
         self.stopWords     = getattr(extra,'stopWords',    None) or None
      
         # Thesaurus: either filename or StopWord object
         self.thesaurus     = getattr(extra,'thesaurus',    None) or None
 
-        # Thesaurus: either filename or StopWord object
-        self.thesaurus     = getattr(extra,'thesaurus',    None) or None
-
         # Normalizer: characterMapping
         self.charMapping   = getattr(extra,'characterMapping', None) or None
 
-
         if not self.nearSearch in (None,'internal','documentLookup'):
             raise ValueError,'nearSearch must be either None, "internal"'\
                              ' or "documentLookup"'
-
         
-        if verbose: TextIndexCommon.debug_on()
-        else:       TextIndexCommon.debug_off()
+        if verbose: self.debugOn()
+        else:       self.debugOff()
 
         self.timed_statistics  = timed_statistics 
 
         self.clear()
-                        
+
+
+    def debugOn(self):          TextIndexCommon.debug_on()
+    def debugOff(self):         TextIndexCommon.debug_off()
+
+    def timedStatsOn(self):     self.timed_statistics = 1
+    def timedStatsOff(self):    self.timed_statistics = 0
 
     def clear(self):
 
+        self.__OBJECTS   = IOBTree()  # mapping RID->object reference 
+
         self._IDX        = IOBTree()
         self._invIDX     = IOBTree()
         self._thesaurus  = None
@@ -387,6 +401,11 @@
 
     def index_object(self, documentId, obj, threshold=None):
 
+        # HACK !
+        # We store references to the object for testing purposes
+        # only. A later implementation must be more clever
+
+        self.__OBJECTS[documentId] = obj
 
         T = Timer(self.timed_statistics)
 
@@ -529,7 +548,14 @@
 
         res = self.query( q )
 
-        return res
+        # this should be fixed
+        bucket = IIBucket()
+
+        for docId in res[0]:
+            bucket[ docId ] = 1 
+
+        print bucket
+        return res[0], (self.id, )
 
 
     def query(self, q):
@@ -552,6 +578,7 @@
         debug('Result:',res)
 
         # Bah....this sucks 
+
         return res.docIds(), (self.id,)
 
 
@@ -601,7 +628,7 @@
         debug("ProximityLexionLookup: ",word)
 
         if not self.useProximity:
-            raise QueryException, 'proximity search is not enabled'
+            raise TextIndexNGException, 'proximity search is not enabled'
 
 
         # Lookup list of wordIds (usually should contain only *one*)
@@ -668,80 +695,91 @@
         
 
     def txNear(self, *sets):
-        """ perform near intersection of two ResultSets """
+        """ perform near search on results sets """
+        
+        # One resultset consists of an IISet() of documentIds and 
+        # tuple whose first element is the word (from LexiconLookup())
+        #  
 
-        # first we perform an intersection
+        # First we perform an intersection to get the documentIds of
+        # those documents that contain all the words
 
         r = self.txIntersection(*sets)
+        docIds = r.docIds()
 
-        # XXX: Near search ...
+        # Now we determine for every document the positions of all
+        # the words inside the document. Then we compare all the positions
+        # to determine adjacency
 
-        return 
+        words = []
+        for set in sets:  words.append(set.words()[0])
 
-        
+        res_docIds = IISet()
 
-    def positionsFromDocumentLookup(self,docId, words):
-        """ search all positions for a list of words for
-            a given document given by its documentId.
-            positions() returns a mapping word to
-            list of positions of the word inside the document.
-        """
+        for docId in docIds:
 
-        debug('searching positions docid: %s, words: %s' % (docId,words))
+            posMap = self.positionsFromDocumentLookup(docId, words)
 
-        res = OOBTree()
+            debug("Position Map for NEAR:")
+            for k,v in posMap.items(): debug("\t'%s'" % k,":","'%s'" % v)
 
-        # obtain object from ZCatalog
-        # this implementation must be changed for ZCatalog usage
-        # (for testing purposes we read from the filesystem)
+            if self.checkPositionMap( posMap):
+                res_docIds.insert(docId)
 
-        uid = self.catalog.paths[docId]
-        data = open(uid).read()   # CHANGE THAT !!!
+        r = ResultSet( docIds,  words)       
+        debug("\treturn: ",r)
 
-        # Split retrieved document and obtain list of word positions
+        return r
+        
+    def checkPositionMap(self, posMap):
+        """ check if a PositionMap represents a valid match for
+            a near search
+        """
 
-        SP = self._v_splitterfunc(data)
+        # a posMap is a mapping from word to an IISet() of positions
+        # of that word inside one document
 
-        for word in words:
-            posLst = SP.indexes(word)        
+        # to be written
 
-            res[word] = IISet(posLst)
-      
-        for k,v in  res.items():
-            debug(k,v)
+        return 1
 
-        return res
 
+    # THIS IS A BAD BAD HACK !
 
+    def getDataFromObject(self, docId):
+        obj = self.__OBJECTS[docId]
+        data = getattr(obj,self.id)
+        return data
 
-    def positionsFromInternalStorage(self,docId, words):
+
+    def positionsFromDocumentLookup(self,docId, words):
         """ search all positions for a list of words for
             a given document given by its documentId.
             positions() returns a mapping word to
             list of positions of the word inside the document.
         """
-      
-        debug('searching positions docid: %s, words: %s' % (docId,words))
 
-        res = OOBTree()
+        debug('searching positions docid: %s, word: %s' % (docId,words))
 
-        for w in words:
+        res = OOBTree()
 
-            if isinstance(w,IntType):
-                wid = w
-                word = self._v_getWordById(w) 
-            else:
-                wid = self._v_getIdByWord(w)[0]
-                word = w
+        # obtain object from ZCatalog
+        # This is a bad hack!
+        data = self.getDataFromObject( docId )         
 
-            if self._IDX[wid].has_key(docId):
-                res[w] = self._IDX[wid][docId]
+        # Split retrieved document and obtain list of word positions
 
+        SP = self._v_splitterfunc(data)
 
-        for k,v in res.items():
-            debug(k,v)
+        for word in words:
+            
+            posLst = SP.indexes(word)        
+            res[word] = IISet(posLst)
+      
+        for k,v in  res.items(): debug(k,':',v)
 
         return res
+
 
 
     def numObjects(self):