[Zope-CVS] CVS: Products/ZCTextIndex - ILexicon.py:1.1.2.4 Lexicon.py:1.1.2.5 ZCTextIndex.py:1.1.2.10

Guido van Rossum guido@python.org
Fri, 3 May 2002 13:18:31 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv12927

Modified Files:
      Tag: TextIndexDS9-branch
	ILexicon.py Lexicon.py ZCTextIndex.py 
Log Message:
Make the splitter part of the normal pipeline.
Use *args syntax to pass the pipeline elements to the Lexicon.


=== Products/ZCTextIndex/ILexicon.py 1.1.2.3 => 1.1.2.4 ===
         """Return a sequence of ids of the words parsed from the text.
 
+        The input text may be either a string or a list of strings.
+
         Parses the text as if they are search terms, and skips words that
         aren't in the lexicon.
         """
 
     def sourceToWordIds(text):
         """Return a sequence of ids of the words parsed from the text.
+
+        The input text may be either a string or a list of strings.
 
         Parses the text as if they come from a source document, and creates
         new word ids for words that aren't (yet) in the lexicon.


=== Products/ZCTextIndex/Lexicon.py 1.1.2.4 => 1.1.2.5 ===
 
 class Lexicon:
-    def __init__(self, splitter, pipeline=()):
+
+    def __init__(self, *pipeline):
         self.__wids = OIBTree()
         self.__words = IOBTree()
         # XXX we're reserving wid 0, but that might be yagni
-        self.__lastwid = 1
+        self.__nextwid = 1
         self.__pipeline = pipeline
-        self.__splitter = splitter
 
     def length(self):
         # Return the number of unique terms in the lexicon
-        return self.__lastwid - 1
+        return self.__nextwid - 1
 
     def sourceToWordIds(self, text):
-        last = self.__splitter.process(text)
+        last = _text2list(text)
         for element in self.__pipeline:
             last = element.process(last)
         wids = []
@@ -38,7 +38,7 @@
         return wids
         
     def termToWordIds(self, text):
-        last = self.__splitter.process(text)
+        last = _text2list(text)
         for element in self.__pipeline:
             last = element.process(last)
         wids = []
@@ -57,6 +57,15 @@
         return wid
 
     def __new_wid(self):
-        wid = self.__lastwid
-        self.__lastwid += 1
+        wid = self.__nextwid
+        self.__nextwid += 1
         return wid
+
+def _text2list(text):
+    # Helper: splitter input may be a string or a list of strings
+    try:
+        text + ""
+    except:
+        return text
+    else:
+        return [text]


=== Products/ZCTextIndex/ZCTextIndex.py 1.1.2.9 => 1.1.2.10 ===
     def __init__(self, doc_attr="text"):
         self._fieldname = doc_attr
-        self.lexicon = Lexicon(Splitter(),
-                               [CaseNormalizer(), StopWordRemover()])
+        self.lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
         self.engine = QueryEngine()
         self.index = Index(self.lexicon)
         self.parser = QueryParser()
@@ -68,8 +67,11 @@
 
 class Splitter:
 
-    def process(self, text):
-        return re.findall(r"\w+", text)
+    def process(self, lst):
+        result = []
+        for s in lst:
+            result += re.findall(r"\w+", s)
+        return result
 
 class CaseNormalizer: