[Zope-CVS] CVS: Products/ZCTextIndex - HTMLSplitter.py:1.9
Guido van Rossum
guido@python.org
Wed, 22 May 2002 16:06:55 -0400
Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv9545
Modified Files:
HTMLSplitter.py
Log Message:
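Remove the unused HTMLSplitter class (it was too simplistic to be useful).
Add glob support to the HTMLWordSplitter class: a new processGlob() method splits text with a word pattern that keeps the "*" and "?" wildcard characters.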
=== Products/ZCTextIndex/HTMLSplitter.py 1.8 => 1.9 ===
import re
-class HTMLSplitter:
-
- __implements__ = ISplitter
-
- def process(self, text):
- return re.sub('<[^>]*>', ' ', text).split()
-
class HTMLWordSplitter:
__implements__ = ISplitter
- def process(self, text):
+ def process(self, text, wordpat=r"\w+"):
splat = []
for t in text:
- splat += self._split(t)
+ splat += self._split(t, wordpat)
return splat
- def _split(self, text):
+ def processGlob(self, text):
+ return self.process(text, r"\w+[\w*?]*") # see Lexicon.globToWordIds()
+
+ def _split(self, text, wordpat):
text = text.lower()
- remove = ["<[^>]*>",
- "&[A-Za-z]+;",
- "\W+"]
+ remove = [r"<[^<>]*>",
+ r"&[A-Za-z]+;"]
for pat in remove:
text = re.sub(pat, " ", text)
- rx = re.compile("[A-Za-z]")
- return [word for word in text.split()
- if len(word) > 1 and rx.search(word)]
+ return re.findall(wordpat, text)
element_factory.registerFactory('Word Splitter',
'HTML aware splitter',