[Zope] Malicious HTML remover and HTML to Text converter
Andy McKay
andym@ActiveState.com
Wed, 4 Apr 2001 10:30:01 -0700
Thanks, very useful.
--
Andy McKay.
----- Original Message -----
From: "Chris Withers" <chrisw@nipltd.com>
To: "Farrell, Troy" <troy.farrell@wilcom.com>; "Steve Drees"
<drees@the-bridge.net>; <zope@zope.org>
Sent: Wednesday, April 04, 2001 9:54 AM
Subject: [Zope] Malicious HTML remover and HTML to Text converter
> Okay,
>
> Coupla people asked for this, so here goes:
>
> Use as follows:
>
> from stripogram import html2text, html2safehtml
>
> mylumpofdodgyhtml # a lump of dodgy html ;-)
>
> mylumpofcoolcleancollectedhtml =
> html2safehtml(mylumpofdodgyhtml,valid_tags=('b', 'a', 'i', 'br', 'p'))
>
> mylumpoftext = html2text(mylumpofcoolcleancollectedhtml)
>
> cheers,
>
> Chris
>
> PS: Patches to the parser used in html2text gratefully received ;-)
----------------------------------------------------------------------------
----
> __doc__ = """HTML filter thanks to Itamar Shtull-Trauring"""
>
> import sgmllib, string
>
> class HTML2Text(sgmllib.SGMLParser):
>
> from htmlentitydefs import entitydefs # replace entitydefs from
sgmllib
>
> def __init__(self):
> sgmllib.SGMLParser.__init__(self)
> self.result = ""
> self.indent = 0
> self.ol_number = 0
>
> def add_line(self,text,newline='\n'):
> self.result = self.result + self.indent*' ' + text + newline
>
> def mod_indent(self,i):
> self.indent = self.indent + i
> if self.indent < 0:
> self.indent = 0
>
> def handle_data(self, data):
> if data:
> map(self.add_line,string.split(string.strip(data),'\n'))
>
> def unknown_starttag(self, tag, attrs):
> """ Convert HTML to something meaningful in plain text """
> tag = string.lower(tag)
>
> if tag[0]=='h' or tag in ['br','pre','p','hr']:
> # insert a blank line
> self.add_line('')
>
> elif tag =='img':
> # newline, text, newline
> src = ''
>
> for k, v in attrs:
> if string.lower(k) == 'src':
> src = v
>
> self.add_line('')
> self.add_line('Image: %s' % src)
>
> elif tag =='li':
> self.add_line('')
> if self.ol_number:
> # num - text
> self.add_line('%s - ' % self.ol_number,'')
> self.ol_number = self.ol_number + 1
> else:
> # - text
> self.add_line('- ','')
>
> elif tag in ['dd','dt']:
> self.add_line('')
> # increase indent
> self.mod_indent(+1)
>
> elif tag in ['ul','dl','ol']:
> # blank line
> #self.add_line('')
> # increase indent
> self.mod_indent(+1)
> if tag=='ol':
> self.ol_number = 1
>
> def unknown_endtag(self, tag):
> """ Convert HTML to something meaningful in plain text """
> tag = string.lower(tag)
>
> if tag[0]=='h' or tag in ['pre']:
> # newline, text, newline
> self.add_line('')
>
> elif tag =='li':
> #self.add_line('')
> pass
>
> elif tag in ['dd','dt']:
> #self.add_line('')
> # descrease indent
> self.mod_indent(-1)
>
> elif tag in ['ul','dl','ol']:
> # blank line
> #self.add_line('')
> # decrease indent
> self.mod_indent(-1)
> self.ol_number = 0
>
> class StrippingParser(sgmllib.SGMLParser):
>
> from htmlentitydefs import entitydefs # replace entitydefs from
sgmllib
>
> def __init__(self):
> sgmllib.SGMLParser.__init__(self)
> self.result = ""
> self.endTagList = []
>
> def handle_data(self, data):
> if data:
> self.result = self.result + data
>
> def handle_charref(self, name):
> self.result = "%s&#%s;" % (self.result, name)
>
> def handle_entityref(self, name):
> if self.entitydefs.has_key(name):
> x = ';'
> else:
> # this breaks unstandard entities that end with ';'
> x = ''
> self.result = "%s&%s%s" % (self.result, name, x)
>
> def unknown_starttag(self, tag, attrs):
> """ Delete all tags except for legal ones """
> if string.lower(tag) in self.valid_tags:
> self.result = self.result + '<' + tag
> for k, v in attrs:
> if string.lower(k[0:2]) != 'on' and string.lower(v[0:10])
!= 'javascript':
> self.result = '%s %s="%s"' % (self.result, k, v)
> endTag = '</%s>' % tag
> self.endTagList.insert(0,endTag)
> self.result = self.result + '>'
>
> def unknown_endtag(self, tag):
> if string.lower(tag) in self.valid_tags:
> self.result = "%s</%s>" % (self.result, tag)
> remTag = '</%s>' % tag
> self.endTagList.remove(remTag)
>
> def cleanup(self):
> """ Append missing closing tags """
> for j in range(len(self.endTagList)):
> self.result = self.result + self.endTagList[j]
>
def html2text(s):
    """Return the plain-text rendering of the HTML string ``s``.

    The markup is run through an ``HTML2Text`` parser and the text it
    accumulated in ``result`` is returned.
    """
    # Module-level function: it takes only the markup, and the parser
    # class lives in this module (there is no ``striphtml`` to look in).
    parser = HTML2Text()
    parser.feed(s)
    parser.close()
    return parser.result
>
def html2safehtml(s, valid_tags=('b', 'a', 'i', 'br', 'p')):
    """Return ``s`` with every tag not listed in ``valid_tags`` removed.

    ``valid_tags`` is the sequence of lower-case tag names to keep.  The
    markup is run through a ``StrippingParser``; after parsing, its
    ``cleanup()`` appends closing tags for elements left open.
    """
    stripper = StrippingParser()
    stripper.valid_tags = valid_tags

    # Parse the whole document, then flush the parser's buffered input.
    stripper.feed(s)
    stripper.close()

    # Balance the output before handing it back.
    stripper.cleanup()
    return stripper.result
>