[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - TextIndexNG.py:1.2.2.7
Andreas Jung
andreas@zope.com
Mon, 7 Jan 2002 15:22:39 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv1777
Modified Files:
Tag: ajung-textindexng-branch
TextIndexNG.py
Log Message:
+ StopWord handling
+ prelim. support for thesauri
=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextIndexNG.py 1.2.2.6 => 1.2.2.7 ===
from Products.PluginIndexes.TextIndex import Splitter
-import Stemmer
-from types import IntType, StringType, UnicodeType
+from types import IntType, StringType, UnicodeType, InstanceType
from TextOperators import *
from TextIndexCommon import *
+import Stemmer
+import Thesaurus, StopWords
+
class TextIndexNG(PluggableIndex.PluggableIndex, Persistent,
Implicit, SimpleItem):
@@ -99,6 +101,12 @@
# storage of positions for near search ('internal','documentLookup')
self.nearStorage = getattr(extra,'nearStorage', 'internal')
+ # Stopwords: either filename or StopWord object
+ self.stopWords = getattr(extra,'stopWords', None)
+
+ # Thesaurus: either filename or Thesaurus object
+ self.thesaurus = getattr(extra,'thesaurus', None)
+
if self.lexicon == 'None': self.lexicon = None
if self.useStemmer == 'None': self.useStemmer = None
@@ -111,8 +119,24 @@
def clear(self):
- self._IDX = IOBTree()
- self._invIDX = IOBTree()
+ self._IDX = IOBTree()
+ self._invIDX = IOBTree()
+ self._thesaurus = None
+ self._stopwords = StopWords.StopWords()
+
+
+ # Thesaurus
+ if isinstance(self.thesaurus, StringType):
+ self._thesaurus = Thesaurus.Thesaurus(self.thesaurus)
+ elif isinstance(self.thesaurus, InstanceType):
+ self._thesaurus = self.thesaurus
+
+ # StopWords
+
+ if isinstance(self.stopWords, StringType):
+ self._stopwords = StopWords.StopWords(self.stopWords)
+ elif isinstance(self.stopWords, InstanceType):
+ self._stopwords = self.stopWords
# near Search
if self.nearStorage == 'internal':
@@ -251,6 +275,9 @@
# Split the text into a list of words
+ # The splitterfunc just returns an iterator-like object.
+ # For performance reasons it might be better when the
+ # splitter returns the list of split words.
words = self._splitterfunc(source,encoding=encoding)
@@ -259,10 +286,13 @@
# every single wordId
widLst = []
+ isStopWord = self._stopwords.has_key
pos = 0
for word in words:
+ if isStopWord(word): continue
+
# stem the single word
if self._stemmerfunc:
word = self._stemmerfunc(word)
@@ -332,8 +362,15 @@
r = {}
res = self._IDX.get(wids[0], None)
- for docId in res.keys():
- r[docId] = self._IDX[wids[0]][docId]
+
+ if self.nearStorage == 'internal':
+ for docId in res.keys():
+ r[docId] = self._IDX[wids[0]][docId]
+
+ else:
+ for docId in res:
+ r[docId] = IISet()
+
else:
r={}
@@ -388,7 +425,7 @@
return (IIBucket(), (self.id,))
- def positionsByDocumentLookup(self,docId, words):
+ def positionsFromDocumentLookup(self,docId, words):
""" search all positions for a list of words for
a given document given by its documentId.
positions() returns a mapping word to