[Zope-CVS] CVS: Products/ZCTextIndex - ILexicon.py:1.1.2.4 Lexicon.py:1.1.2.5 ZCTextIndex.py:1.1.2.10
Guido van Rossum
guido@python.org
Fri, 3 May 2002 13:18:31 -0400
Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv12927
Modified Files:
Tag: TextIndexDS9-branch
ILexicon.py Lexicon.py ZCTextIndex.py
Log Message:
Make the splitter part of the normal pipeline.
Use *args syntax to pass the pipeline elements to the Lexicon.
=== Products/ZCTextIndex/ILexicon.py 1.1.2.3 => 1.1.2.4 ===
"""Return a sequence of ids of the words parsed from the text.
+ The input text may be either a string or a list of strings.
+
Parses the text as if they are search terms, and skips words that
aren't in the lexicon.
"""
def sourceToWordIds(text):
"""Return a sequence of ids of the words parsed from the text.
+
+ The input text may be either a string or a list of strings.
Parses the text as if they come from a source document, and creates
new word ids for words that aren't (yet) in the lexicon.
=== Products/ZCTextIndex/Lexicon.py 1.1.2.4 => 1.1.2.5 ===
class Lexicon:
- def __init__(self, splitter, pipeline=()):
+
+ def __init__(self, *pipeline):
self.__wids = OIBTree()
self.__words = IOBTree()
# XXX we're reserving wid 0, but that might be yagni
- self.__lastwid = 1
+ self.__nextwid = 1
self.__pipeline = pipeline
- self.__splitter = splitter
def length(self):
# Return the number of unique terms in the lexicon
- return self.__lastwid - 1
+ return self.__nextwid - 1
def sourceToWordIds(self, text):
- last = self.__splitter.process(text)
+ last = _text2list(text)
for element in self.__pipeline:
last = element.process(last)
wids = []
@@ -38,7 +38,7 @@
return wids
def termToWordIds(self, text):
- last = self.__splitter.process(text)
+ last = _text2list(text)
for element in self.__pipeline:
last = element.process(last)
wids = []
@@ -57,6 +57,15 @@
return wid
def __new_wid(self):
- wid = self.__lastwid
- self.__lastwid += 1
+ wid = self.__nextwid
+ self.__nextwid += 1
return wid
+
+def _text2list(text):
+ # Helper: splitter input may be a string or a list of strings
+ try:
+ text + ""
+ except:
+ return text
+ else:
+ return [text]
=== Products/ZCTextIndex/ZCTextIndex.py 1.1.2.9 => 1.1.2.10 ===
def __init__(self, doc_attr="text"):
self._fieldname = doc_attr
- self.lexicon = Lexicon(Splitter(),
- [CaseNormalizer(), StopWordRemover()])
+ self.lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
self.engine = QueryEngine()
self.index = Index(self.lexicon)
self.parser = QueryParser()
@@ -68,8 +67,11 @@
class Splitter:
- def process(self, text):
- return re.findall(r"\w+", text)
+ def process(self, lst):
+ result = []
+ for s in lst:
+ result += re.findall(r"\w+", s)
+ return result
class CaseNormalizer: