[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - TextIndexNG.py:1.2.2.33

Andreas Jung andreas@digicool.com
Thu, 14 Feb 2002 17:57:58 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv11733

Modified Files:
      Tag: ajung-textindexng-branch
	TextIndexNG.py 
Log Message:
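- reimplemented the stopwords loop (filtering is now delegated to
  indexsupport.stopwordfilter instead of a Python-level filter/lambda)
- some minor performance tweaks for inserting forward entries
  (try/except around the update/insert instead of has_key() checks)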


=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextIndexNG.py 1.2.2.32 => 1.2.2.33 ===
 from SimilarityLexicon import SimilarityLexicon
 
-from types import IntType, StringType, UnicodeType, InstanceType, DictType
+from types import IntType, StringType, UnicodeType, InstanceType, DictType, ListType
 from TextOperators import *
 from TextIndexCommon import *
 
@@ -47,6 +47,7 @@
 
 import Stemmer, Proximity as Similarity
 import Thesaurus, StopWords, Normalizer
+import indexsupport
 import TextIndexCommon
 import time
 
@@ -76,7 +77,7 @@
 
         print "total: %5.5lf secs" % total
         for k,v in self.d.items():
-            print "%-20s  %5.4lf secs   (%5.2lf %%) " % (k,v,100.0*v/total)    
+            print "%-20s  %5.5lf secs   (%5.2lf %%) " % (k,v,100.0*v/total)    
         
   
 
@@ -178,7 +179,7 @@
 
         # Normalizer: characterMapping
         self.charMapping   = getattr(extra,'characterMapping', None) or None
-        
+
         if verbose: self.debugOn()
         else:       self.debugOff()
 
@@ -213,9 +214,10 @@
         else:
             self._thesaurus = None
 
+
         # Normalizer 
 
-        if isinstance(self.charMapping, StringType):
+        if type(self.charMapping) in (StringType,ListType):
             self._normalizer = Normalizer.Normalizer(self.charMapping)
         elif isinstance(self.charMapping, InstanceType):
             self._normalizer = self.charMapping
@@ -303,11 +305,11 @@
         """
 
         idx = self._invIDX
-
-        if idx.has_key(documentId)==0:
-            idx[documentId] = IISet(widLst)
-        else:
+        
+        try:
             idx[documentId].update(widLst)
+        except:
+            idx[documentId] = IISet(widLst)
 
 
 
@@ -328,10 +330,11 @@
 
         for wordId in wordIds:
 
-            if idx.has_key(wordId) == 0:
+            try:
+                idx[wordId].insert(documentId)
+            except:
                 idx[wordId] = IISet()
-
-            idx[wordId].insert(documentId)
+                idx[wordId].insert(documentId)
 
 
     def insertSimilarityEntries(self,wordIds,documentId):
@@ -398,6 +401,7 @@
 
         T("encoding")
 
+
         # Split the text into a list of words
         # The splitterfunc just returns an iterator-like object.
 
@@ -411,10 +415,8 @@
         T("Splitter")
 
         # apply stopwords list 
-        # Maybe this should go into a C extension for performance reasons
 
-        isStopWord = self._stopwords.has_key
-        words =  filter(lambda x,f=isStopWord: f(x)==0, words)   
+        words = indexsupport.stopwordfilter(words, self._stopwords.getDict())
         T("Stopwords")
 
         # Check if we want Similarity searches. If yes, we need to create
@@ -436,8 +438,9 @@
 
         T("Stemmer")
 
+
         # Normalization
-        
+
         if self._normalizer:
             words = self._normalizer.normalize(words)