[CMF-checkins] CVS: CMF/CMFDefault - utils.py:1.13

Tres Seaver tseaver@zope.com
Fri, 19 Jul 2002 21:18:38 -0400


Update of /cvs-repository/CMF/CMFDefault
In directory cvs.zope.org:/tmp/cvs-serv25677/CMFDefault

Modified Files:
	utils.py 
Log Message:
 - Merge code hygiene, new splitter from 1.3 branch.

=== CMF/CMFDefault/utils.py 1.12 => 1.13 ===
 """
     Utility functions.
 """
-from string import split, join, lstrip, lower, strip, capitalize
 from sgmllib import SGMLParser
 import re
 import os
+
 from Globals import package_home
+
 _dtmldir = os.path.join( package_home( globals() ), 'dtml' )
 
 def formatRFC822Headers( headers ):
-    """
-        Convert the key-value pairs in 'headers' to valid RFC822-style
+
+    """ Convert the key-value pairs in 'headers' to valid RFC822-style
         headers, including adding leading whitespace to elements which
         contain newlines in order to preserve continuation-line semantics.
     """
@@ -20,14 +21,14 @@
     for key, value in headers:
 
         vallines = linesplit.split( value )
-        munged.append( '%s: %s' % ( key, join( vallines, '\r\n  ' ) ) )
+        munged.append( '%s: %s' % ( key, '\r\n  '.join( vallines ) ) )
 
-    return join( munged, '\r\n' )
+    return '\r\n'.join( munged )
 
 
-def parseHeadersBody( body, headers=None, rc=re.compile(r'\n|\r\n')):
-    """
-        Parse any leading 'RFC-822'-ish headers from an uploaded
+def parseHeadersBody( body, headers=None, rc=re.compile( r'\n|\r\n' ) ):
+
+    """ Parse any leading 'RFC-822'-ish headers from an uploaded
         document, returning a dictionary containing the headers
         and the stripped body.
 
@@ -70,122 +71,314 @@
         headers = headers.copy()
 
     hdrlist = []
+
     for line in lines:
-        if not strip(line):
+
+        if not line.strip():
             break
-        tokens = split( line, ': ' )
+
+        tokens = line.split( ': ' )
+
         if len( tokens ) > 1:
-            hdrlist.append( ( tokens[0], join( tokens[1:], ': ' ) ) )
+            hdrlist.append( ( tokens[0], ': '.join( tokens[1:] ) ) )
         elif i == 0:
             return headers, body     # no headers, just return those passed in.
         else:    # continuation
             last, hdrlist = hdrlist[ -1 ], hdrlist[ :-1 ]
             hdrlist.append( ( last[ 0 ]
-                            , join( ( last[1], lstrip( line ) ), '\n' )
+                            , '\n'.join( ( last[1], line.lstrip() ) )
                             ) )
         i = i + 1
 
     for hdr in hdrlist:
         headers[ hdr[0] ] = hdr[ 1 ]
 
-    return headers, join( lines[ i+1: ], '\n' )
+    return headers, '\n'.join( lines[ i+1: ] )
 
 
 def semi_split(s):
-    return map(strip, split(s, ';'))
+
+    """ Split 's' on semicolons.
+    """
+    return map(lambda x: x.strip(), s.split( ';' ) )
 
 def comma_split(s):
-    return map(strip, split(s, ','))
 
-def seq_strip (seq, stripper=strip):
-    if type(seq) == type([]):
-        return map ( stripper, seq)
-    if type(seq) == type(()):
-        #seq1 = list(seq)
-        return tuple (map(stripper, seq))
-    raise ValueError, "%s of unsupported sequencetype %s" % (seq, type(seq))
-
-def tuplize( valueName, value, splitter=split ):
-    if type(value) == type(()): return seq_strip( value )
-    if type(value) == type([]): return seq_strip( tuple( value ))
-    if type(value) == type(''): return seq_strip( tuple( splitter( value ) ))
+    """ Split 's' on commas.
+    """
+    return map(lambda x: x.strip(), s.split( ',') )
+
+def seq_strip (seq, stripper=lambda x: x.strip() ):
+
+    """ Strip a sequence of strings.
+    """
+    if type( seq ) == type( [] ):
+        return map( stripper, seq )
+
+    if type( seq ) == type( () ):
+        return tuple( map( stripper, seq ) )
+
+    raise ValueError, "%s of unsupported sequencetype %s" % ( seq, type( seq ) )
+
+def tuplize( valueName, value, splitter=lambda x: x.strip() ):
+
+    if type( value ) == type( () ):
+        return seq_strip( value )
+
+    if type( value ) == type( [] ):
+        return seq_strip( tuple( value ) )
+
+    if type( value ) == type( '' ):
+        return seq_strip( tuple( splitter( value ) ) )
+
     raise ValueError, "%s of unsupported type" % valueName
 
 
-class SimpleHTMLParser(SGMLParser):
+class SimpleHTMLParser( SGMLParser ):
+
     #from htmlentitydefs import entitydefs
 
-    def __init__(self, verbose=0):
-        SGMLParser.__init__(self, verbose)
+    def __init__( self, verbose=0 ):
+
+        SGMLParser.__init__( self, verbose )
         self.savedata = None
         self.title = ''
         self.metatags = {}
         self.body = ''
 
-    def handle_data(self, data):
+    def handle_data( self, data ):
+
         if self.savedata is not None:
             self.savedata = self.savedata + data
 
-    def handle_charref(self, ref):
-        self.handle_data("&#%s;" % ref)
+    def handle_charref( self, ref ):
+
+        self.handle_data( "&#%s;" % ref )
+
+    def handle_entityref( self, ref ):
+
+        self.handle_data( "&%s;" % ref )
 
-    def handle_entityref(self, ref):
-        self.handle_data("&%s;" % ref)
+    def save_bgn( self ):
 
-    def save_bgn(self):
         self.savedata = ''
 
-    def save_end(self):
+    def save_end( self ):
+
         data = self.savedata
         self.savedata = None
         return data
 
-        
-    def start_title(self, attrs):
+    def start_title( self, attrs ):
+
         self.save_bgn()
 
-    def end_title(self):
+    def end_title( self ):
+
         self.title = self.save_end()
 
-    def do_meta(self, attrs):
+    def do_meta( self, attrs ):
+
         name = ''
         content = ''
+
         for attrname, value in attrs:
-            value = strip(value)
-            if attrname == "name": name = capitalize(value)
-            if attrname == "content": content = value
+
+            value = value.strip()
+
+            if attrname == "name":
+                name = value.capitalize()
+
+            if attrname == "content":
+                content = value
+
         if name:
-            self.metatags[name] = content
+            self.metatags[ name ] = content
     
-    def unknown_startag(self, tag, attrs):
+    def unknown_startag( self, tag, attrs ):
+
         self.setliteral()
 
-    def unknown_endtag(self, tag):
+    def unknown_endtag( self, tag ):
+
         self.setliteral()
-    
 
-_bodyre = re.compile(r'^\s*<html.*<body.*?>', re.DOTALL|re.I)
-_endbodyre = re.compile(r'</body', re.DOTALL|re.I)
+#
+#   HTML cleaning code
+#
+
+# These are the HTML tags that we will leave intact
+VALID_TAGS = { 'a'          : 1
+             , 'b'          : 1
+             , 'base'       : 1
+             , 'blockquote' : 1
+             , 'body'       : 1
+             , 'br'         : 1
+             , 'caption'    : 1
+             , 'cite'       : 1
+             , 'code'       : 1
+             , 'div'        : 1
+             , 'dl'         : 1
+             , 'dt'         : 1
+             , 'dd'         : 1
+             , 'em'         : 1
+             , 'h1'         : 1
+             , 'h2'         : 1
+             , 'h3'         : 1
+             , 'h4'         : 1
+             , 'h5'         : 1
+             , 'h6'         : 1
+             , 'head'       : 1
+             , 'hr'         : 1
+             , 'html'       : 1
+             , 'i'          : 1
+             , 'img'        : 1
+             , 'kbd'        : 1
+             , 'li'         : 1
+           # , 'link'       : 1 type="script" hoses us
+             , 'meta'       : 1
+             , 'ol'         : 1
+             , 'p'          : 1
+             , 'pre'        : 1
+             , 'span'       : 1
+             , 'strong'     : 1
+             , 'table'      : 1
+             , 'tbody'      : 1
+             , 'td'         : 1
+             , 'th'         : 1
+             , 'title'      : 1
+             , 'tr'         : 1
+             , 'tt'         : 1
+             , 'ul'         : 1
+             }
+
+NASTY_TAGS = { 'script'     : 1
+             , 'object'     : 1
+             , 'embed'      : 1
+             , 'applet'     : 1
+             }
+
+class IllegalHTML( ValueError ):
+    pass
+
+class StrippingParser( SGMLParser ):
+
+    """ Pass only allowed tags;  raise exception for known-bad.
+    """
+
+    from htmlentitydefs import entitydefs # replace entitydefs from sgmllib
+
+    def __init__( self ):
+
+        SGMLParser.__init__( self )
+        self.result = ""
+
+    def handle_data( self, data ):
+
+        if data:
+            self.result = self.result + data
+
+    def handle_charref( self, name ):
+
+        self.result = "%s&#%s;" % ( self.result, name )
+
+    def handle_entityref(self, name):
+
+        if self.entitydefs.has_key(name):
+            x = ';'
+        else:
+            # this breaks unstandard entities that end with ';'
+            x = ''
+
+        self.result = "%s&%s%s" % (self.result, name, x)
+
+    def unknown_starttag(self, tag, attrs):
+
+        """ Delete all tags except for legal ones.
+        """
+        if VALID_TAGS.get( tag ):
+
+            self.result = self.result + '<' + tag
+
+            for k, v in attrs:
+
+                if k.lower().startswith( 'on' ):
+                    raise IllegalHTML, 'Javascipt event "%s" not allowed.' % k
+
+                if v.lower().startswith( 'javascript:' ):
+                    raise IllegalHTML, 'Javascipt URI "%s" not allowed.' % v
+
+                self.result = '%s %s="%s"' % (self.result, k, v)
+
+            endTag = '</%s>' % tag
+            self.result = self.result + '>'
+
+        elif NASTY_TAGS.get( tag ):
+            raise IllegalHTML, 'Dynamic tag "%s" not allowed.' % tag
+
+        else:
+            pass    # omit tag
+
+    def unknown_endtag(self, tag):
 
-def bodyfinder(text):
-    bod = _bodyre.search(text)
-    if not bod: return text
-
-    end = _endbodyre.search(text)
-    if not end: return text
-    else: return text[bod.end():end.start()]
-
-htfinder = re.compile(r'<html', re.DOTALL|re.I)
-def html_headcheck(html):
-    """ Returns 'true' if document looks HTML-ish enough """
+        if VALID_TAGS.get( tag ):
+
+            self.result = "%s</%s>" % (self.result, tag)
+            remTag = '</%s>' % tag
+
+def scrubHTML( html ):
+
+    """ Strip illegal HTML tags from string text.
+    """
+    parser = StrippingParser()
+    parser.feed( html )
+    parser.close()
+    return parser.result
+
+def isHTMLSafe( html ):
+
+    """ Would current HTML be permitted to be saved?
+    """
+    try:
+        scrubHTML( html )
+    except IllegalHTML:
+        return 0
+    else:
+        return 1
+
+_bodyre = re.compile( r'^\s*<html.*<body.*?>', re.DOTALL | re.I )
+
+_endbodyre = re.compile( r'</body', re.DOTALL | re.I )
+
+def bodyfinder( text ):
+
+    bod = _bodyre.search( text )
+    if not bod:
+        return text
+
+    end = _endbodyre.search( text )
+    if not end:
+        return text
+    else:
+        return text[bod.end():end.start()]
+
+htfinder = re.compile( r'<html', re.DOTALL | re.I )
+
+def html_headcheck( html ):
+
+    """ Return 'true' if document looks HTML-ish enough.
+    """
     if not htfinder.search(html):
         return 0
+
     lines = re.split(r'[\n\r]+?', html)
+
     for line in lines:
-        line = strip(line)
+        line = line.strip()
+
         if not line:
             continue
-        elif lower(line[:5]) == '<html':
+        elif line.lower().startswith( '<html' ):
             return 1
         elif line[0] != '<':
             return 0