Malicious HTML remover and HTML to Text converter
Okay, Coupla people asked for this, so here goes: Use as follows: from stripogram import html2text, html2safehtml mylumpofdodgyhtml # a lump of dodgy html ;-) mylumpofcoolcleancollectedhtml = html2safehtml(mylumpofdodgyhtml,valid_tags=('b', 'a', 'i', 'br', 'p')) mylumpoftext = html2text(mylumpofcoolcleancollectedhtml) cheers, Chris PS: Patches to the parser used in html2text greatfully recieved ;-) __doc__ = """HTML filter thanks to Itamar Shtull-Trauring""" import sgmllib, string class HTML2Text(sgmllib.SGMLParser): from htmlentitydefs import entitydefs # replace entitydefs from sgmllib def __init__(self): sgmllib.SGMLParser.__init__(self) self.result = "" self.indent = 0 self.ol_number = 0 def add_line(self,text,newline='\n'): self.result = self.result + self.indent*' ' + text + newline def mod_indent(self,i): self.indent = self.indent + i if self.indent < 0: self.indent = 0 def handle_data(self, data): if data: map(self.add_line,string.split(string.strip(data),'\n')) def unknown_starttag(self, tag, attrs): """ Convert HTML to something meaningful in plain text """ tag = string.lower(tag) if tag[0]=='h' or tag in ['br','pre','p','hr']: # insert a blank line self.add_line('') elif tag =='img': # newline, text, newline src = '' for k, v in attrs: if string.lower(k) == 'src': src = v self.add_line('') self.add_line('Image: %s' % src) elif tag =='li': self.add_line('') if self.ol_number: # num - text self.add_line('%s - ' % self.ol_number,'') self.ol_number = self.ol_number + 1 else: # - text self.add_line('- ','') elif tag in ['dd','dt']: self.add_line('') # increase indent self.mod_indent(+1) elif tag in ['ul','dl','ol']: # blank line #self.add_line('') # increase indent self.mod_indent(+1) if tag=='ol': self.ol_number = 1 def unknown_endtag(self, tag): """ Convert HTML to something meaningful in plain text """ tag = string.lower(tag) if tag[0]=='h' or tag in ['pre']: # newline, text, newline self.add_line('') elif tag =='li': #self.add_line('') pass elif tag in 
['dd','dt']: #self.add_line('') # descrease indent self.mod_indent(-1) elif tag in ['ul','dl','ol']: # blank line #self.add_line('') # decrease indent self.mod_indent(-1) self.ol_number = 0 class StrippingParser(sgmllib.SGMLParser): from htmlentitydefs import entitydefs # replace entitydefs from sgmllib def __init__(self): sgmllib.SGMLParser.__init__(self) self.result = "" self.endTagList = [] def handle_data(self, data): if data: self.result = self.result + data def handle_charref(self, name): self.result = "%s%s;" % (self.result, name) def handle_entityref(self, name): if self.entitydefs.has_key(name): x = ';' else: # this breaks unstandard entities that end with ';' x = '' self.result = "%s&%s%s" % (self.result, name, x) def unknown_starttag(self, tag, attrs): """ Delete all tags except for legal ones """ if string.lower(tag) in self.valid_tags: self.result = self.result + '<' + tag for k, v in attrs: if string.lower(k[0:2]) != 'on' and string.lower(v[0:10]) != 'javascript': self.result = '%s %s="%s"' % (self.result, k, v) endTag = '</%s>' % tag self.endTagList.insert(0,endTag) self.result = self.result + '>' def unknown_endtag(self, tag): if string.lower(tag) in self.valid_tags: self.result = "%s</%s>" % (self.result, tag) remTag = '</%s>' % tag self.endTagList.remove(remTag) def cleanup(self): """ Append missing closing tags """ for j in range(len(self.endTagList)): self.result = self.result + self.endTagList[j] def html2text(self,s): parser = striphtml.HTML2Text() parser.feed(s) parser.close() return parser.result def html2safehtml(s, valid_tags=('b', 'a', 'i', 'br', 'p')): parser = StrippingParser() parser.valid_tags = valid_tags parser.feed(s) parser.close() parser.cleanup() return parser.result
Thanks, very useful. -- Andy McKay. ----- Original Message ----- From: "Chris Withers" <chrisw@nipltd.com> To: "Farrell, Troy" <troy.farrell@wilcom.com>; "Steve Drees" <drees@the-bridge.net>; <zope@zope.org> Sent: Wednesday, April 04, 2001 9:54 AM Subject: [Zope] Malicious HTML remover and HTML to Text converter
Okay,
Coupla people asked for this, so here goes:
Use as follows:
from stripogram import html2text, html2safehtml
mylumpofdodgyhtml # a lump of dodgy html ;-)
mylumpofcoolcleancollectedhtml = html2safehtml(mylumpofdodgyhtml,valid_tags=('b', 'a', 'i', 'br', 'p'))
mylumpoftext = html2text(mylumpofcoolcleancollectedhtml)
cheers,
Chris
PS: Patches to the parser used in html2text gratefully received ;-)
---------------------------------------------------------------------------- ----
__doc__ = """HTML filter thanks to Itamar Shtull-Trauring"""
import sgmllib, string
class HTML2Text(sgmllib.SGMLParser):
    """Convert HTML into indented plain text.

    Feed markup through ``feed()`` / ``close()`` and read the converted
    text from ``result``.  The class defines no per-tag handlers, so the
    base parser hands every tag to ``unknown_starttag`` /
    ``unknown_endtag``; those translate a handful of block-level tags
    into blank lines, bullets and indentation.  All other tags are
    dropped and only their text content is kept.
    """

    from htmlentitydefs import entitydefs  # replace entitydefs from sgmllib

    # Only real heading tags count as headings.  The previous test,
    # ``tag[0] == 'h'``, also matched <html>, <head> and friends and
    # emitted spurious blank lines for them.
    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    # Tags whose opening inserts a blank line.
    _BREAK_TAGS = ('br', 'pre', 'p', 'hr')
    # Definition-list items: blank line plus one level of indent.
    _DEFINITION_TAGS = ('dd', 'dt')
    # List containers: one level of indent for their content.
    _LIST_TAGS = ('ul', 'dl', 'ol')

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.result = ""        # plain text produced so far
        self.indent = 0         # current indentation, in spaces
        self.ol_number = 0      # next <ol> item number; 0 means "use a bullet"
        self._list_stack = []   # ol_number values of the enclosing lists

    def add_line(self, text, newline='\n'):
        """Append ``text`` to the result, indented, followed by ``newline``.

        Pass ``newline=''`` to leave the line open so that the next call
        continues on the same output line (used for list bullets).
        """
        self.result = self.result + self.indent * ' ' + text + newline

    def mod_indent(self, i):
        """Change the indentation by ``i`` spaces, never going below zero."""
        self.indent = max(0, self.indent + i)

    def handle_data(self, data):
        """Emit each line of a text run as its own indented output line."""
        if data:
            # An explicit loop: add_line is called for its side effect, and
            # map() would silently do nothing wherever it is lazy.
            for line in data.strip().split('\n'):
                self.add_line(line)

    def unknown_starttag(self, tag, attrs):
        """ Convert HTML to something meaningful in plain text """
        tag = tag.lower()
        if tag in self._HEADING_TAGS or tag in self._BREAK_TAGS:
            # insert a blank line
            self.add_line('')
        elif tag == 'img':
            # newline, text, newline; the last src attribute wins
            src = ''
            for k, v in attrs:
                if k.lower() == 'src':
                    src = v
            self.add_line('')
            self.add_line('Image: %s' % src)
        elif tag == 'li':
            self.add_line('')
            if self.ol_number:
                # num - text
                self.add_line('%s - ' % self.ol_number, '')
                self.ol_number = self.ol_number + 1
            else:
                # - text
                self.add_line('- ', '')
        elif tag in self._DEFINITION_TAGS:
            self.add_line('')
            # increase indent
            self.mod_indent(+1)
        elif tag in self._LIST_TAGS:
            # increase indent
            self.mod_indent(+1)
            # Remember the enclosing list's numbering so it can be restored
            # when this list closes; a nested <ul>/<dl> must not continue
            # the numbering of an outer <ol>.
            self._list_stack.append(self.ol_number)
            if tag == 'ol':
                self.ol_number = 1
            else:
                self.ol_number = 0

    def unknown_endtag(self, tag):
        """ Convert HTML to something meaningful in plain text """
        tag = tag.lower()
        if tag in self._HEADING_TAGS or tag == 'pre':
            # blank line after the element's text
            self.add_line('')
        elif tag in self._DEFINITION_TAGS:
            # decrease indent
            self.mod_indent(-1)
        elif tag in self._LIST_TAGS:
            # decrease indent
            self.mod_indent(-1)
            # Restore the numbering of the enclosing list.  The stack is
            # empty for a stray closing tag; fall back to "no numbering".
            if self._list_stack:
                self.ol_number = self._list_stack.pop()
            else:
                self.ol_number = 0
        # </li> and every other closing tag produce no output.
class StrippingParser(sgmllib.SGMLParser):
    """Remove every tag that is not whitelisted, keeping the text content.

    Set ``valid_tags`` to the lower-case tag names that may pass through,
    call ``feed()`` and ``close()``, then ``cleanup()`` to close any tags
    still open, and read the filtered markup from ``result``.

    NOTE(review): attribute filtering is a blocklist (``on*`` handlers and
    ``javascript`` values only).  It does not inspect ``style`` values or
    entity-encoded URL schemes, so keep ``valid_tags`` small and do not
    treat the output as safe against every script-injection vector.
    """

    from htmlentitydefs import entitydefs  # replace entitydefs from sgmllib

    # Default whitelist, so the parser is usable without assigning
    # ``valid_tags`` first; callers may still override it per instance.
    valid_tags = ('b', 'a', 'i', 'br', 'p')

    # Elements that never take a closing tag.  They must not be queued in
    # endTagList, otherwise cleanup() appends bogus markup such as </br>.
    _VOID_TAGS = ('area', 'base', 'br', 'col', 'hr', 'img', 'input',
                  'link', 'meta', 'param')

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.result = ""        # filtered markup produced so far
        self.endTagList = []    # closing tags still owed, innermost first

    def handle_data(self, data):
        """Copy text content through unchanged."""
        if data:
            self.result = self.result + data

    def handle_charref(self, name):
        """Re-emit a numeric character reference.

        ``name`` is the reference without its ``&#`` prefix and ``;``
        suffix, so both are restored here; previously the prefix was
        dropped and ``&#65;`` came out as ``65;``.
        """
        self.result = "%s&#%s;" % (self.result, name)

    def handle_entityref(self, name):
        """Re-emit an entity reference, terminated only if it is known."""
        if name in self.entitydefs:
            suffix = ';'
        else:
            # this breaks unstandard entities that end with ';'
            suffix = ''
        self.result = "%s&%s%s" % (self.result, name, suffix)

    def _is_safe_attr(self, name, value):
        """Return false for event handlers and ``javascript`` values."""
        if name[0:2].lower() == 'on':
            return False
        # Drop whitespace and control characters before looking at the
        # prefix, so that values like " javascript:..." or
        # "java\tscript:..." cannot slip past the check.
        compact = ''.join([c for c in value if ord(c) > 32]).lower()
        return not compact.startswith('javascript')

    def _escape_attr(self, value):
        """Escape the characters that could end a double-quoted value.

        ``&`` is deliberately left alone: whether the parser has already
        decoded entities in attribute values depends on the sgmllib
        version, and escaping it could double-encode existing entities.
        """
        value = value.replace('"', '&quot;')
        value = value.replace('<', '&lt;')
        return value.replace('>', '&gt;')

    def unknown_starttag(self, tag, attrs):
        """ Delete all tags except for legal ones """
        if tag.lower() not in self.valid_tags:
            return
        pieces = ['<' + tag]
        for k, v in attrs:
            if self._is_safe_attr(k, v):
                # The value is escaped so an embedded quote cannot close
                # the attribute and smuggle in extra attributes.
                pieces.append(' %s="%s"' % (k, self._escape_attr(v)))
        pieces.append('>')
        self.result = self.result + ''.join(pieces)
        if tag.lower() not in self._VOID_TAGS:
            # Queue the matching closing tag, innermost first.
            self.endTagList.insert(0, '</%s>' % tag)

    def unknown_endtag(self, tag):
        """Emit a whitelisted closing tag if a matching tag is open.

        A closing tag with no pending opening tag is dropped; previously
        it was emitted and ``list.remove`` then raised ``ValueError``.
        """
        if tag.lower() not in self.valid_tags:
            return
        closer = '</%s>' % tag
        if closer in self.endTagList:
            self.result = self.result + closer
            self.endTagList.remove(closer)

    def cleanup(self):
        """ Append missing closing tags """
        self.result = self.result + ''.join(self.endTagList)
        # Empty the queue so a second call cannot append the tags again.
        del self.endTagList[:]
def html2text(s):
    """Return the HTML string ``s`` converted to plain text.

    The markup is run through an ``HTML2Text`` parser and the text it
    accumulated in ``result`` is returned.

    The earlier signature took a stray ``self`` argument and looked the
    parser up on an undefined ``striphtml`` name, so every call failed;
    the function now matches its documented use, ``html2text(html)``.
    """
    parser = HTML2Text()
    parser.feed(s)
    parser.close()
    return parser.result
def html2safehtml(s, valid_tags=('b', 'a', 'i', 'br', 'p')):
    """Return ``s`` with every tag not named in ``valid_tags`` removed.

    ``valid_tags`` is handed to a fresh ``StrippingParser`` as its
    whitelist.  After the input has been parsed, the parser's
    ``cleanup()`` appends closing tags for anything left open, and the
    filtered markup is returned.
    """
    stripper = StrippingParser()
    stripper.valid_tags = valid_tags

    # Parse the whole input in one go and flush the parser's buffer.
    stripper.feed(s)
    stripper.close()

    # Balance the output before handing it back.
    stripper.cleanup()
    return stripper.result
participants (2)
-
Andy McKay -
Chris Withers