[CMF-checkins] CVS: CMF/CMFDefault - utils.py:1.10.8.1

Chris Withers chrisw@nipltd.com
Fri, 15 Feb 2002 12:34:08 -0500


Update of /cvs-repository/CMF/CMFDefault
In directory cvs.zope.org:/tmp/cvs-serv10709

Added Files:
      Tag: ChrisW-refactor_tests-branch
	utils.py 
Log Message:
huh?!

=== Added File CMF/CMFDefault/utils.py ===
"""
    Utility functions.
"""
from string import split, join, lstrip, lower, strip, capitalize
from sgmllib import SGMLParser
import re
import os
from Globals import package_home
_dtmldir = os.path.join( package_home( globals() ), 'dtml' )

def formatRFC822Headers( headers ):
    """
        Convert the key-value pairs in 'headers' to valid RFC822-style
        headers, including adding leading whitespace to elements which
        contain newlines in order to preserve continuation-line semantics.
    """
    munged = []
    linesplit = re.compile( r'[\n\r]+?' )

    for key, value in headers:

        vallines = linesplit.split( value )
        munged.append( '%s: %s' % ( key, join( vallines, '\r\n  ' ) ) )

    return join( munged, '\r\n' )


def parseHeadersBody( body, headers=None, rc=re.compile(r'\n|\r\n')):
    """
        Parse any leading 'RFC-822'-ish headers from an uploaded
        document, returning a dictionary containing the headers
        and the stripped body.

        E.g.::

            Title: Some title
            Creator: Tres Seaver
            Format: text/plain
            X-Text-Format: structured

            Overview

            This document .....

            First Section

            ....


        would be returned as::

            { 'Title' : 'Some title'
            , 'Creator' : 'Tres Seaver'
            , 'Format' : 'text/plain'
            , 'text_format': 'structured'
            }

        as the headers, plus the body, starting with 'Overview' as
        the first line (the intervening blank line is a separator).

        Allow passing initial dictionary as headers.
    """
    # Split the lines apart, taking into account Mac|Unix|Windows endings
    lines = rc.split(body)

    i = 0
    if headers is None:
        headers = {}
    else:
        headers = headers.copy()

    hdrlist = []
    for line in lines:
        if not strip(line):
            break
        tokens = split( line, ': ' )
        if len( tokens ) > 1:
            hdrlist.append( ( tokens[0], join( tokens[1:], ': ' ) ) )
        elif i == 0:
            return headers, body     # no headers, just return those passed in.
        else:    # continuation
            last, hdrlist = hdrlist[ -1 ], hdrlist[ :-1 ]
            hdrlist.append( ( last[ 0 ]
                            , join( ( last[1], lstrip( line ) ), '\n' )
                            ) )
        i = i + 1

    for hdr in hdrlist:
        headers[ hdr[0] ] = hdr[ 1 ]

    return headers, join( lines[ i+1: ], '\n' )


def semi_split(s):
    return map(strip, split(s, ';'))

def comma_split(s):
    return map(strip, split(s, ','))

def tuplize( valueName, value, splitter=split ):
    if type(value) == type(()): return value
    if type(value) == type([]): return tuple( value )
    if type(value) == type(''): return tuple( splitter( value ) )
    raise ValueError, "%s of unsupported type" % valueName


class SimpleHTMLParser(SGMLParser):
    #from htmlentitydefs import entitydefs

    def __init__(self, verbose=0):
        SGMLParser.__init__(self, verbose)
        self.savedata = None
        self.title = ''
        self.metatags = {}
        self.body = ''

    def handle_data(self, data):
        if self.savedata is not None:
            self.savedata = self.savedata + data

    def handle_charref(self, ref):
        self.handle_data("&#%s;" % ref)

    def handle_entityref(self, ref):
        self.handle_data("&%s;" % ref)

    def save_bgn(self):
        self.savedata = ''

    def save_end(self):
        data = self.savedata
        self.savedata = None
        return data

        
    def start_title(self, attrs):
        self.save_bgn()

    def end_title(self):
        self.title = self.save_end()

    def do_meta(self, attrs):
        name = ''
        content = ''
        for attrname, value in attrs:
            value = strip(value)
            if attrname == "name": name = capitalize(value)
            if attrname == "content": content = value
        if name:
            self.metatags[name] = content
    
    def unknown_startag(self, tag, attrs):
        self.setliteral()

    def unknown_endtag(self, tag):
        self.setliteral()
    

_bodyre = re.compile(r'<body.*?>', re.DOTALL|re.I)
_endbodyre = re.compile(r'</body', re.DOTALL|re.I)

def bodyfinder(text):
    bod = _bodyre.search(text)
    if not bod: return text

    end = _endbodyre.search(text)
    if not end: return text
    else: return text[bod.end():end.start()]

htfinder = re.compile(r'<html', re.DOTALL|re.I)
def html_headcheck(html):
    """ Returns 'true' if document looks HTML-ish enough """
    if not htfinder.search(html):
        return 0
    lines = re.split(r'[\n\r]+?', html)
    for line in lines:
        line = strip(line)
        if not line:
            continue
        elif lower(line[:5]) == '<html':
            return 1
        elif line[0] != '<':
            return 0