[Zope-CVS] CVS: Products/ZCTextIndex - HTMLSplitter.py:1.1.2.1

Jeremy Hylton jeremy@zope.com
Thu, 9 May 2002 19:18:37 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv1488

Added Files:
      Tag: TextIndexDS9-branch
	HTMLSplitter.py 
Log Message:
Add a simple splitter for html documents and a more featureful one.



=== Added File Products/ZCTextIndex/HTMLSplitter.py ===
from Products.ZCTextIndex.ISplitter import ISplitter

import re

class HTMLSplitter:
    """Simple splitter: strip markup tags, then split on whitespace."""

    __implements__ = ISplitter

    def process(self, text):
        """Return the whitespace-separated tokens of text without tags.

        Every '<...>' span is replaced by a single space before the
        split, so words on either side of a tag stay separate tokens.
        """
        tag_rx = re.compile('<[^>]*>')
        without_tags = tag_rx.sub(' ', text)
        return without_tags.split()

class HTMLWordSplitter:
    """Splitter that reduces HTML to a list of lowercase words.

    Tags and named character entities are discarded, every run of
    characters matching the regex class \\W becomes a separator, and
    only tokens longer than one character that contain at least one
    ASCII letter are kept.
    """

    __implements__ = ISplitter

    def process(self, text):
        """Return the list of words found in text.

        text may be a single string or a sequence of strings.  A string
        is processed whole; for a sequence every element is processed
        in order and the words are concatenated.  An empty sequence
        yields an empty list.
        """
        # Duck-type the string check so plain and unicode strings are
        # both recognised without naming either type.
        if hasattr(text, "lower"):
            chunks = [text]
        else:
            chunks = text
        remove = [r"<[^>]*>",      # markup tags
                  r"&[A-Za-z]+;",  # named entities such as &amp;
                  r"\W+"]          # runs of non-word characters
        rx = re.compile("[A-Za-z]")
        words = []
        for chunk in chunks:
            chunk = chunk.lower()
            # Order matters: tags and entities must go before \W+ eats
            # the '<', '>', '&' and ';' that delimit them.
            for pat in remove:
                chunk = re.sub(pat, " ", chunk)
            # Drop one-character tokens and tokens without any letter
            # (for example bare numbers).
            words.extend([word for word in chunk.split()
                          if len(word) > 1 and rx.search(word)])
        return words

if __name__ == "__main__":
    # Command-line smoke test: for every file named on the command
    # line, print its path followed by the words the splitter finds.
    import sys
    splitter = HTMLWordSplitter()
    for path in sys.argv[1:]:
        f = open(path, "rb")
        try:
            buf = f.read()
        finally:
            # Release the handle even when the read fails.
            f.close()
        # Single-argument parenthesised print behaves the same as the
        # print statement here.
        print(path)
        # Hand the contents over as a one-element list: process()
        # expects a sequence of strings, not a bare string.
        print(splitter.process([buf]))