[Zope-CVS] CVS: Products/ZCTextIndex - HTMLSplitter.py:1.1.2.1
Jeremy Hylton
jeremy@zope.com
Thu, 9 May 2002 19:18:37 -0400
Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv1488
Added Files:
Tag: TextIndexDS9-branch
HTMLSplitter.py
Log Message:
Add a simple splitter for HTML documents and a more featureful one.
=== Added File Products/ZCTextIndex/HTMLSplitter.py ===
from Products.ZCTextIndex.ISplitter import ISplitter
import re
class HTMLSplitter:
    """Minimal splitter: drop HTML tags, then split on whitespace.

    Nothing else is normalised -- case, punctuation and character
    entities are passed through exactly as they appear in the input.
    """

    __implements__ = ISplitter

    def process(self, text):
        """Return the whitespace-separated tokens of the string *text*.

        Every ``<...>`` run is replaced by a single space first, so a tag
        sitting between two words still separates them.
        """
        tag_free = re.compile('<[^>]*>').sub(' ', text)
        return tag_free.split()
class HTMLWordSplitter:
    """Splitter that reduces HTML to a list of lower-cased words.

    Each input string is lower-cased, then tags, named character entities
    and runs of non-word characters are blanked out.  Of the remaining
    whitespace-separated tokens, only those longer than one character that
    contain at least one ASCII letter are kept.
    """

    __implements__ = ISplitter

    # Patterns replaced by a space, applied in this order: tags first, then
    # named entities such as "&amp;" (before the "&" and ";" would be eaten
    # by the last pattern), then every run of non-word characters.
    _remove = (r"<[^>]*>",
               r"&[A-Za-z]+;",
               r"\W+")

    # A token must contain a letter to count as a word; this rejects
    # purely numeric or underscore-only tokens.
    _letter = re.compile("[A-Za-z]")

    def process(self, text):
        """Return the list of words found in *text*.

        *text* may be a single string or a sequence of strings.  Every
        string is split and the resulting words are concatenated in input
        order.  An empty sequence yields an empty list.
        """
        # Strings have .lower() and sequences of strings do not; wrapping a
        # bare string keeps it from being iterated character by character.
        if hasattr(text, "lower"):
            text = [text]
        words = []
        for chunk in text:
            words.extend(self._split(chunk))
        return words

    def _split(self, text):
        """Return the words of the single string *text*."""
        text = text.lower()
        for pat in self._remove:
            text = re.sub(pat, " ", text)
        return [word for word in text.split()
                if len(word) > 1 and self._letter.search(word)]
if __name__ == "__main__":
    # Command-line smoke test: for every file named on the command line,
    # print its path followed by the words HTMLWordSplitter extracts.
    import sys
    splitter = HTMLWordSplitter()
    for path in sys.argv[1:]:
        f = open(path, "rb")
        try:
            buf = f.read()
        finally:
            # Release the handle even if the read fails.
            f.close()
        # Single-argument print(...) parses and behaves the same whether
        # print is a statement or a function.
        print(path)
        # process() works on a sequence of strings, so hand it the file
        # contents as a one-element list.
        # NOTE(review): if this is ever run on Python 3, "rb" yields bytes,
        # which the str regex patterns reject -- decode before splitting.
        print(splitter.process([buf]))