[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - TextIndexNG.py:1.2.2.33
Andreas Jung
andreas@digicool.com
Thu, 14 Feb 2002 17:57:58 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv11733
Modified Files:
Tag: ajung-textindexng-branch
TextIndexNG.py
Log Message:
- reimplemented the stopword filtering loop
- some minor performance tweaks for inserting forward index entries
=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextIndexNG.py 1.2.2.32 => 1.2.2.33 ===
from SimilarityLexicon import SimilarityLexicon
-from types import IntType, StringType, UnicodeType, InstanceType, DictType
+from types import IntType, StringType, UnicodeType, InstanceType, DictType, ListType
from TextOperators import *
from TextIndexCommon import *
@@ -47,6 +47,7 @@
import Stemmer, Proximity as Similarity
import Thesaurus, StopWords, Normalizer
+import indexsupport
import TextIndexCommon
import time
@@ -76,7 +77,7 @@
print "total: %5.5lf secs" % total
for k,v in self.d.items():
- print "%-20s %5.4lf secs (%5.2lf %%) " % (k,v,100.0*v/total)
+ print "%-20s %5.5lf secs (%5.2lf %%) " % (k,v,100.0*v/total)
@@ -178,7 +179,7 @@
# Normalizer: characterMapping
self.charMapping = getattr(extra,'characterMapping', None) or None
-
+
if verbose: self.debugOn()
else: self.debugOff()
@@ -213,9 +214,10 @@
else:
self._thesaurus = None
+
# Normalizer
- if isinstance(self.charMapping, StringType):
+ if type(self.charMapping) in (StringType,ListType):
self._normalizer = Normalizer.Normalizer(self.charMapping)
elif isinstance(self.charMapping, InstanceType):
self._normalizer = self.charMapping
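The added ListType check (together with the new ListType import at the top of the file) lets characterMapping be configured either as a string or as a list, while an already constructed instance is still passed through untouched. A standalone sketch of the same dispatch; the normalizer_class argument stands in for the product's Normalizer.Normalizer:

    from types import StringType, ListType, InstanceType

    def buildNormalizer(charMapping, normalizer_class):
        # A plain string or a list of mapping entries: build a fresh
        # normalizer from it (this is what the new ListType branch enables).
        if type(charMapping) in (StringType, ListType):
            return normalizer_class(charMapping)
        # An already constructed (old-style class) instance is used as-is.
        elif isinstance(charMapping, InstanceType):
            return charMapping
        # Anything else (e.g. None) disables normalization.
        return None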
@@ -303,11 +305,11 @@
"""
idx = self._invIDX
-
- if idx.has_key(documentId)==0:
- idx[documentId] = IISet(widLst)
- else:
+
+ try:
idx[documentId].update(widLst)
+ except:
+ idx[documentId] = IISet(widLst)
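The has_key() test is replaced by a try/except here, so the common case (the document already has a word-id set) costs a single lookup, and only a missing key pays for building a fresh IISet. A minimal sketch of the same pattern; the function name follows the "forward entries" wording of the log message, the IOBTree container is an assumption (only IISet appears in the diff), and KeyError is caught instead of the bare except so unrelated errors stay visible:

    from BTrees.IIBTree import IISet
    from BTrees.IOBTree import IOBTree

    def insertForwardEntries(idx, documentId, widLst):
        # idx maps documentId (int) -> IISet of word ids
        try:
            idx[documentId].update(widLst)    # common case: entry exists
        except KeyError:
            idx[documentId] = IISet(widLst)   # first words seen for this document

    idx = IOBTree()
    insertForwardEntries(idx, 42, [1, 2, 3])
    insertForwardEntries(idx, 42, [3, 4])     # merges into the existing IISet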
@@ -328,10 +330,11 @@
for wordId in wordIds:
- if idx.has_key(wordId) == 0:
+ try:
+ idx[wordId].insert(documentId)
+ except:
idx[wordId] = IISet()
-
- idx[wordId].insert(documentId)
+ idx[wordId].insert(documentId)
def insertSimilarityEntries(self,wordIds,documentId):
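The loop just above applies the same try/except shape in the other direction: each word id maps to an IISet of document ids, and only a missing posting list triggers the creation branch. A sketch under the same assumptions as before (IOBTree as the outer container, KeyError instead of the bare except, hypothetical function name):

    from BTrees.IIBTree import IISet
    from BTrees.IOBTree import IOBTree

    def insertBackwardEntries(idx, wordIds, documentId):
        # idx maps wordId (int) -> IISet of document ids (the posting list)
        for wordId in wordIds:
            try:
                idx[wordId].insert(documentId)        # posting list exists
            except KeyError:
                idx[wordId] = IISet((documentId,))    # start a new posting list

    idx = IOBTree()
    insertBackwardEntries(idx, [1, 2, 3], 42)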
@@ -398,6 +401,7 @@
T("encoding")
+
# Split the text into a list of words
# The splitterfunc just returns an iterator-like object.
@@ -411,10 +415,8 @@
T("Splitter")
# apply stopwords list
- # Maybe this should go into a C extension for performance reasons
- isStopWord = self._stopwords.has_key
- words = filter(lambda x,f=isStopWord: f(x)==0, words)
+ words = indexsupport.stopwordfilter(words, self._stopwords.getDict())
T("Stopwords")
# Check if we want Similarity searches. If yes, we need to create
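The hunk above is the stopword loop mentioned in the log message: the Python-level filter() over self._stopwords.has_key is dropped in favour of a single call into the new indexsupport extension module. A rough pure-Python equivalent of what that call is expected to compute (the fallback function name is made up; the extension itself is not part of this diff):

    def stopwordfilter_py(words, stopword_dict):
        # Keep only the words that are not keys of stopword_dict.
        # indexsupport.stopwordfilter() presumably does the same job
        # without a Python function call per word.
        has_key = stopword_dict.has_key
        return [w for w in words if not has_key(w)]

    print stopwordfilter_py(['the', 'quick', 'brown', 'fox'], {'the': None})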
@@ -436,8 +438,9 @@
T("Stemmer")
+
# Normalization
-
+
if self._normalizer:
words = self._normalizer.normalize(words)
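This normalize() call is the last transformation shown in the diff. A minimal sketch of a character-mapping normalizer consistent with the characterMapping option above, assuming the mapping is given as a list of (old, new) substring pairs; the actual format accepted by the product's Normalizer module is not visible here:

    class SimpleNormalizer:
        """Replace characters word by word according to a mapping list."""

        def __init__(self, mapping):
            # mapping: sequence of (old, new) pairs, e.g. [('\xe4', 'ae')]
            self._mapping = list(mapping)

        def normalize(self, words):
            result = []
            for word in words:
                for old, new in self._mapping:
                    word = word.replace(old, new)
                result.append(word)
            return result

    n = SimpleNormalizer([('\xe4', 'ae'), ('\xf6', 'oe')])
    print n.normalize(['b\xe4r', 'w\xf6rter'])    # ['baer', 'woerter']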