[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG - TextIndexNG.py:1.2.2.16

Andreas Jung andreas@digicool.com
Mon, 21 Jan 2002 15:12:18 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG
In directory cvs.zope.org:/tmp/cvs-serv28022

Modified Files:
      Tag: ajung-textindexng-branch
	TextIndexNG.py 
Log Message:
- code cleanup
- incorporated all splitter extensions


=== Zope/lib/python/Products/PluginIndexes/TextIndexNG/TextIndexNG.py 1.2.2.15 => 1.2.2.16 ===
         self.useSplitter   = getattr(extra,'useSplitter',   'ZopeSplitter')
 
-
         # max len of splitted words
         self.splitterMaxLen= getattr(extra,'splitterMaxLen', 64)
 
@@ -109,6 +108,9 @@
         # allow single characters
         self.splitterSingleChars   = getattr(extra,'splitterSingleChars',0)
 
+        # perform casefolding (lowercase words): 1/0
+        self.splitterCasefolding   = getattr(extra,'splitterCasefolding',1)
+
         # name of stemmer or None
         self.useStemmer    = getattr(extra,'useStemmer',    None) or None
 
@@ -121,17 +123,14 @@
         # lexicon to be used (name, reference or None(internal))
         self.lexicon       = getattr(extra,'lexicon',       None) or None
 
-        # support near search: 1/0 (requires more storage)
-        self.useNearSearch = getattr(extra,'useNearSearch', 1)
-
         # default maximum distance for words with near search
         self.nearDistance  = getattr(extra,'nearDistance',  5)
 
         # use proximity algorithm
         self.useProximity  = getattr(extra,'useProximity',  None) or None
 
-        # storage of positions for near search ('internal','documentLookup')
-        self.nearStorage   = getattr(extra,'nearStorage',  'internal')
+        # Support for near search (None,'internal','documentLookup')
+        self.nearSearch   = getattr(extra,'nearSearch',  None)
 
         # Stopwords: either filename or StopWord object
         self.stopWords     = getattr(extra,'stopWords',    None) or None
@@ -139,11 +138,10 @@
         # Thesaurus: either filename or StopWord object
         self.thesaurus     = getattr(extra,'thesaurus',    None) or None
 
-        if not self.nearStorage in ('internal','documentLookup'):
-            raise ValueError,'nearStorage must be either "internal"'\
+        if not self.nearSearch in (None,'internal','documentLookup'):
+            raise ValueError,'nearSearch must be either None, "internal"'\
                              ' or "documentLookup"'
 
-
         self.clear()
                         
 
@@ -188,7 +186,7 @@
 
 
         # near Search
-        if self.nearStorage == 'internal':
+        if self.nearSearch == 'internal':
             self._v_positions = self.positionsFromInternalStorage
             self._v_insertForwardEntry = self.insertForwardEntryInternal
         else:
@@ -375,7 +373,13 @@
         # Split the text into a list of words
         # The splitterfunc just returns an iterator-like object.
 
-        words = self._v_splitterfunc(source,encoding=encoding).split()
+        words = self._v_splitterfunc(source,
+                            encoding     = encoding,
+                            casefolding  = self.splitterCasefolding,
+                            maxlen       = self.splitterMaxLen,
+                            indexnumbers = self.splitterIndexNumbers,
+                            singlechar   = self.splitterSingleChars
+                            ).split()
         T("Splitter")
 
         # apply stopwords list 
@@ -515,6 +519,11 @@
             word = self._v_stemmerfunc(word)
             debug("\tStemming: ", word)
 
+        # perform casefolding if necessary
+        if self.splitterCasefolding:
+            word = word.lower()
+            debug('\tCasefolding: ',word)
+
         # Lookup list of wordIds (usually should contain only *one*)
         wids = self._v_getIdByWord(word)
         debug("\tWids: ", wids)
@@ -526,8 +535,9 @@
         # (documentId, list of positions) for one word/wid
         docIds = self._IDX.get(wids[0])
 
-        debug('\tDocIds: ', list(docIds.keys()))
-        debug('\tPositions: ', list(docIds.values()))
+    
+#        debug('\tDocIds: ', list(docIds.keys()))
+#        debug('\tPositions: ', list(docIds.values()))
 
         r = ResultSet( docIds, (word,))