[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - TextIndexNG.py:1.2.2.7

Andreas Jung andreas@zope.com
Mon, 7 Jan 2002 15:22:39 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv1777

Modified Files:
      Tag: ajung-textindexng-branch
	TextIndexNG.py 
Log Message:
+ StopWord handling
+ prelim. support for thesauri
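
Both settings are read from the 'extra' record at construction time and may be
given either as a filename or as a prebuilt object (see the diff below).  A
minimal sketch of how they might be supplied follows; only the attribute names
'stopWords' and 'thesaurus' come from the diff, while the Extra stand-in class
and the file path are illustrative assumptions.

    # Hypothetical configuration sketch (Python 2, matching the code below).
    class Extra:
        """Stand-in for the 'extra' record handed to the index constructor."""
        pass

    extra = Extra()
    extra.stopWords = '/path/to/stopwords.txt'   # a filename ...
    extra.thesaurus = None                       # ... or a prebuilt object, or None

    # This mirrors what the constructor does in the diff below:
    stopWords = getattr(extra, 'stopWords', None)
    thesaurus = getattr(extra, 'thesaurus', None)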


=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextIndexNG.py 1.2.2.6 => 1.2.2.7 ===
 from Products.PluginIndexes.TextIndex import Splitter
 
-import Stemmer
 
-from types import IntType, StringType, UnicodeType
+from types import IntType, StringType, UnicodeType, InstanceType
 from TextOperators import *
 from TextIndexCommon import *
 
+import Stemmer
+import Thesaurus, StopWords
+
 
 class TextIndexNG(PluggableIndex.PluggableIndex, Persistent,
      Implicit, SimpleItem):
@@ -99,6 +101,12 @@
         # storage of positions for near search ('internal','documentLookup')
         self.nearStorage   = getattr(extra,'nearStorage',  'internal')
 
+        # Stopwords: either filename or StopWords object
+        self.stopWords     = getattr(extra,'stopWords',    None)
+
+        # Thesaurus: either filename or Thesaurus object
+        self.thesaurus     = getattr(extra,'thesaurus',    None)
+
         if self.lexicon == 'None':    self.lexicon    = None
         if self.useStemmer == 'None': self.useStemmer = None
     
@@ -111,8 +119,24 @@
 
     def clear(self):
 
-        self._IDX     = IOBTree()
-        self._invIDX  = IOBTree()
+        self._IDX        = IOBTree()
+        self._invIDX     = IOBTree()
+        self._thesaurus  = None
+        self._stopwords  = StopWords.StopWords()
+
+
+        # Thesaurus
+        if isinstance(self.thesaurus, StringType):
+            self._thesaurus =  Thesaurus.Thesaurus(self.thesaurus)
+        elif isinstance(self.thesaurus, InstanceType):
+            self._thesaurus = self.thesaurus
+
+        # StopWords
+
+        if isinstance(self.stopWords, StringType):
+            self._stopwords =  StopWords.StopWords(self.stopWords)
+        elif isinstance(self.stopWords, InstanceType):
+            self._stopwords = self.stopWords
 
         # near Search
         if self.nearStorage == 'internal':
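
The real StopWords.py is not part of this diff, so the sketch below only
illustrates the small interface that clear() and the indexing code appear to
rely on: a constructor that optionally loads a word list from a file, plus a
has_key() test.  The one-word-per-line file format is an assumption; the
Thesaurus class gets the same string-or-instance treatment above.

    # Assumed minimal StopWords interface (Python 2 era, hence has_key()).
    class StopWords:

        def __init__(self, filename=None):
            # Words are kept as dictionary keys for cheap membership tests.
            self._words = {}
            if filename:
                for line in open(filename).readlines():
                    word = line.strip()
                    if word:
                        self._words[word] = None

        def has_key(self, word):
            return self._words.has_key(word)
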
@@ -251,6 +275,9 @@
 
 
         # Split the text into a list of words
+        # The splitterfunc just returns an iterator-like object.
+        # For performance reasons it might be better if the
+        # splitter returned the list of split words directly.
 
         words = self._splitterfunc(source,encoding=encoding)
 
@@ -259,10 +286,13 @@
         # every single wordId
 
         widLst = []
+        isStopWord = self._stopwords.has_key
 
         pos = 0    
         for word in words:
 
+            if isStopWord(word):  continue
+
             # stem the single word        
             if self._stemmerfunc:
                 word = self._stemmerfunc(word)
@@ -332,8 +362,15 @@
                 r = {}
 
                 res  = self._IDX.get(wids[0], None)
-                for docId in  res.keys():
-                    r[docId] = self._IDX[wids[0]][docId]
+
+                if self.nearStorage == 'internal':
+                    for docId in  res.keys():
+                        r[docId] = self._IDX[wids[0]][docId]
+
+                else: 
+                    for docId in res:
+                        r[docId] = IISet()
+
 
             else:
                 r={}
@@ -388,7 +425,7 @@
         return (IIBucket(), (self.id,))
 
 
-    def positionsByDocumentLookup(self,docId, words):
+    def positionsFromDocumentLookup(self,docId, words):
         """ search all positions for a list of words for
             a given document given by its documentId.
             positions() returns a mapping word to