[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - TextIndexNG.py:1.2.2.41
Andreas Jung
andreas@digicool.com
Thu, 21 Feb 2002 20:35:20 -0500
Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv22963/lib/python/Products/PluginIndexes/TextIndexNG
Modified Files:
Tag: ajung-textindexng-branch
TextIndexNG.py
Log Message:
- stemmer and splitter support is now fully functional
- near search is mostly working (positionMap handling still needs some
tweaks to handle phrase searches with positional arguments)
=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextIndexNG.py 1.2.2.40 => 1.2.2.41 ===
# allow single characters
- self.splitterCasefolding = getattr(extra,'splitterCasefolding',1) or 1
+ self.splitterCasefolding = getattr(extra,'splitterCasefolding',1)
# name of stemmer or None
- self.useStemmer = getattr(extra,'useStemmer', None)
+ self.useStemmer = getattr(extra,'useStemmer', None) or None
# default operator to combine queries
self.useOperator = getattr(extra,'useOperator', 'and')
@@ -250,19 +250,6 @@
self._v_Similarityfunc = getattr(Similarity,self.useSimilarity)
- # get splitter function
-
- self.splitterfunc = self.stemmerfunc = None
-
- if self.useSplitter:
- self.splitterfunc = Splitter.getSplitter(self.useSplitter)
-
-
- # stemmer function
-
- if self.useStemmer:
- self.stemmerfunc = Stemmer.Stemmer(self.useStemmer).stem
-
if self.lexicon:
# try to get lexicon through acquisition
@@ -275,9 +262,9 @@
self._LEXICON = GlobbingLexiconNG()
debug('created new globbing lexicon')
- if self.stemmerfunc:
+ if self.useStemmer:
debug('stemming disabled because globbing enabled')
- self.stemmerfunc = None
+ self.useStemmer= None
else:
self._LEXICON = LexiconNG()
@@ -414,15 +401,17 @@
# Split the text into a list of words
- # The splitterfunc just returns an iterator-like object.
- words = self.splitterfunc(source,
- encoding = encoding,
- casefolding = self.splitterCasefolding,
- maxlen = self.splitterMaxLen,
- indexnumbers = self.splitterIndexNumbers,
- singlechar = self.splitterSingleChars
- ).split()
+ SP = Splitter.getSplitter(self.useSplitter)
+
+ words = SP( source,
+ encoding = encoding,
+ casefolding = self.splitterCasefolding,
+ maxlen = self.splitterMaxLen,
+ indexnumbers = self.splitterIndexNumbers,
+ singlechar = self.splitterSingleChars
+ ).split()
+
T("Splitter")
# apply stopwords list
@@ -444,8 +433,9 @@
# Stem all words in one run
- if self.stemmerfunc:
- words = self.stemmerfunc(words)
+ if self.useStemmer:
+ ST = Stemmer.Stemmer( self.useStemmer )
+ words = ST.stem(words)
T("Stemmer")
@@ -579,7 +569,8 @@
# Stem the word if necessary
if self.useStemmer:
- word = self.stemmerfunc(word)
+ ST = Stemmer.Stemmer( self.useStemmer )
+ word = ST.stem(word)
debug("\tStemming: ", word)
# perform casefolding if necessary
@@ -808,7 +799,7 @@
# Split retrieved document and obtain list of word positions
- SP = self.splitterfunc(data)
+ SP = Splitter.getSplitter( self.useSplitter )( data )
for word in words: