[CMF-checkins] CVS: CMF/CMFCore - TextFilters.py:1.1.2.1
Tres Seaver
tseaver@zope.com
Wed, 19 Dec 2001 19:18:45 -0500
Update of /cvs-repository/CMF/CMFCore
In directory cvs.zope.org:/tmp/cvs-serv19344/CMFCore
Added Files:
Tag: tseaver-texthandler-branch
TextFilters.py
Log Message:
- Added 'portal_textmanager', a registry for filters which can be
used both to munge inbound text content (e.g., decapitating
HTML, extracting RFC822-style STX headers, etc.) and to render
outbound content (e.g., rendering STX to HTML). Implemented
several "stock" handlers.
=== Added File CMF/CMFCore/TextFilters.py ===
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""
Common text filters.
"""
from Products.CMFCore.interfaces.portal_textmanager import TextFilter, TextInfo
from UserDict import UserDict
import re
class TextInfoImpl( UserDict ):
    """
    A piece of text bundled with a mapping of extra data about it.

    The text itself is kept in the '_text' attribute and is reached
    through 'getText' / 'setText' (or by calling the instance);
    anything else a TextFilter implementation wants to record about
    the text is stored as ordinary mapping items, via UserDict.
    """

    __implements__ = TextInfo

    # Class-level default: an instance reports '' as its text until
    # 'setText' stores something on the instance itself.
    _text = ''

    def setText( self, text ):
        """
        Store 'text' as the text held by this object.
        """
        self._text = text

    def getText( self ):
        """
        Return the text held by this object.
        """
        return self._text

    # Calling the instance is an alias for 'getText'.
    __call__ = getText
class PassthroughFilter:
    """
    Filter which does no munging of its own:  it only guarantees
    that whatever comes next in line is handed a real TextInfo.
    """

    __implements__ = TextFilter

    def filterText( self, text_info='' ):
        """
        Return 'text_info' as a TextInfo, converting it via
        '_ensureTextInfo' if it is not one already.
        """
        info = _ensureTextInfo( text_info )
        return info

    # Calling the instance is an alias for 'filterText'.
    __call__ = filterText
class HTMLDecapitator:
    """
    Reduce an HTML document to the markup found inside its <body>
    element, recording the <title> / <meta> values as extra data.
    """

    __implements__ = TextFilter

    def _processMetadata( self
                        , parser
                        , splitter=re.compile( r'[, ]+' )
                        ):
        """
        Turn the title and <meta> values collected by 'parser' into
        a DublinCore-style dictionary, and return it.

        o 'Title' is always present (falling back to '').

        o A 'Keywords' entry is stored under 'Subject'.

        o 'Subject' and 'Contributors' values are broken into lists
          by 'splitter' (runs of commas and / or spaces).

        o Every other entry is copied unchanged.
        """
        metadata = { 'Title' : parser.title or '' }
        multivalued = ( 'Subject', 'Contributors' )

        for name, value in parser.metatags.items():

            if name == 'Keywords':
                name = 'Subject'

            if name in multivalued:
                value = splitter.split( value )

            metadata[ name ] = value

        return metadata

    def filterText( self, text_info='' ):
        """
        Return a new TextInfo whose text is the <body> contents of
        the incoming text, and whose 'metadata' item holds the
        dictionary built by '_processMetadata'.

        NOTE(review): the result is built from the body string alone,
        so items already stored on 'text_info' are not carried over --
        confirm that is what chained filters expect.
        """
        html = _ensureTextInfo( text_info )()
        stripped = _makeTextInfo( _bodyfinder( html ) )

        # The parser sees the *whole* document, not just the body,
        # since the values of interest live in the header.
        header_parser = _SimpleHTMLParser()
        header_parser.feed( html )

        stripped[ 'metadata' ] = self._processMetadata( header_parser )
        return stripped

    # Calling the instance is an alias for 'filterText'.
    __call__ = filterText
class STXDecapitator:
    """
    Remove the leading RFC822-style header lines from a
    StructuredText document, recording them as extra data.
    """

    __implements__ = TextFilter

    def filterText( self, text_info='' ):
        """
        Return a new TextInfo whose text is the document minus its
        leading headers, and whose 'metadata' item is the dictionary
        of headers returned by '_parseSTXHeadersBody'.

        NOTE(review): the result is built from the body string alone,
        so items already stored on 'text_info' are not carried over --
        confirm that is what chained filters expect.
        """
        raw = _ensureTextInfo( text_info )()
        parsed = _parseSTXHeadersBody( raw )

        stripped = _makeTextInfo( parsed[ 1 ] )
        stripped[ 'metadata' ] = parsed[ 0 ]
        return stripped

    # Calling the instance is an alias for 'filterText'.
    __call__ = filterText
#
# Helper functions & classes
#
from sgmllib import SGMLParser
from string import join, split, capitalize # XXX: WAAAA! 2.3 compatibility
def _makeTextInfo( text_or_info ):
    """
    Build a brand-new TextInfoImpl from 'text_or_info' and return it.

    o If 'text_or_info' implements TextInfo, its items and its text
      are copied.

    o If it is a string (8-bit or unicode), it becomes the text.

    o If it is a plain dictionary, its 'text' item (default '')
      becomes the text, and all other items are copied.

    o Anything else yields an empty TextInfoImpl.
    """
    info = TextInfoImpl()

    if TextInfo.isImplementedBy( text_or_info ):
        info.update( text_or_info )
        info.setText( text_or_info() )
        return info

    kind = type( text_or_info )

    if kind in ( type( '' ), type( u'' ) ):
        info.setText( text_or_info )
        return info

    if kind is type( {} ):
        for key, value in text_or_info.items():
            # 'text' is not extra data:  it is handled separately below.
            if key != 'text':
                info[ key ] = value
        info.setText( text_or_info.get( 'text', '' ) )

    return info
def _ensureTextInfo( text_or_info ):
    """
    Return 'text_or_info' itself if it already implements TextInfo;
    otherwise, return a new instance made from it by '_makeTextInfo'.
    """
    if not TextInfo.isImplementedBy( text_or_info ):
        text_or_info = _makeTextInfo( text_or_info )

    return text_or_info
class _SimpleHTMLParser(SGMLParser):
    """
    Pick the header tags out of an HTML document, storing what they
    carry in attributes of the parser:

    o 'title' -- the text between <title> and </title>.

    o 'metatags' -- mapping of capitalized <meta> 'name' values to
      their 'content' values.

    o 'links' -- one dictionary of attributes per <link> tag.

    'body' is initialised to '' and not touched again by this class.

    TODO:  Capture <style>, <script>?  (Why, for content?)
    """

    def __init__(self, verbose=0):
        SGMLParser.__init__(self, verbose)
        self.title = ''
        self.metatags = {}
        self.links = []
        self.body = ''
        # None means "not capturing";  a string means "append data here".
        self.savedata = None

    def handle_data(self, data):
        """ Append 'data' to the capture buffer, if one is open. """
        if self.savedata is None:
            return
        self.savedata = self.savedata + data

    def handle_charref(self, ref):
        """ Pass a character reference through as literal text. """
        literal = "&#%s;" % ref
        self.handle_data(literal)

    def handle_entityref(self, ref):
        """ Pass an entity reference through as literal text. """
        literal = "&%s;" % ref
        self.handle_data(literal)

    def save_bgn(self):
        """ Open (or reset) the capture buffer. """
        self.savedata = ''

    def save_end(self):
        """ Close the capture buffer, returning what it collected. """
        captured, self.savedata = self.savedata, None
        return captured

    def start_title(self, attrs):
        self.save_bgn()

    def end_title(self):
        self.title = self.save_end()

    def do_meta(self, attrs):
        """ Record the 'content' of a <meta> tag under its 'name'. """
        name = content = ''

        for attrname, value in attrs:
            value = value.strip()
            if attrname == "name":
                name = capitalize( value )
            elif attrname == "content":
                content = value

        # <meta> tags without a (non-empty) name are ignored.
        if name:
            self.metatags[name] = content

    def do_link( self, attrs ):
        """ Record the attributes of a <link> tag as a dictionary. """
        link_attrs = {}
        for attrname, value in attrs:
            link_attrs[ attrname ] = value
        self.links.append( link_attrs )

    # NOTE(review): this name is one 't' short of SGMLParser's
    # 'unknown_starttag' hook, so presumably the base parser never
    # calls it.  Confirm before "fixing" the spelling:  doing so
    # would start calling 'setliteral' for every unknown start tag.
    def unknown_startag(self, tag, attrs):
        self.setliteral()

    def unknown_endtag(self, tag):
        self.setliteral()
def _bodyfinder( text
, BODYSTART = re.compile(r'<body.*?>', re.DOTALL|re.I)
, BODYEND = re.compile(r'</body', re.DOTALL|re.I)
):
"""
Return only the portion of 'text' which lies between the
<body> and </body> tags; if either is missing, return the
whole thing.
"""
body_start = BODYSTART.search(text)
if not body_start:
return text
body_end = BODYEND.search(text)
if not body_end:
return text
return text[ body_start.end() : body_end.start() ]
def _parseSTXHeadersBody( body
, headers=None
, LINE_SPLIT=re.compile( r'[\n\r]+?' )
, COLON_SPLIT=re.compile( r':[ ]*' )
):
"""
Parse any leading 'RFC-822'-ish headers from an uploaded
document, returning a dictionary containing the headers
and the stripped body.
E.g.::
Title: Some title
Creator: Tres Seaver
Format: text/plain
X-Text-Format: structured
Overview
This document .....
First Section
....
would be returned as::
{ 'Title' : 'Some title'
, 'Creator' : 'Tres Seaver'
, 'Format' : 'text/plain'
, 'text_format': 'structured'
}
as the headers, plus the body, starting with 'Overview' as
the first line (the intervening blank line is a separator).
Allow passing initial dictionary as headers.
"""
# Split the lines apart, taking into account Mac|Unix|Windows endings
lines = LINE_SPLIT.split( body )
i = 0
if headers is None:
headers = {}
else:
headers = headers.copy()
hdrlist = []
for line in lines:
if line and line[-1] == '\r':
line = line[:-1]
if not line:
break
tokens = COLON_SPLIT.split( line )
if len( tokens ) > 1:
hdrlist.append( ( capitalize( tokens[0] )
, join( tokens[1:], ': ' )
) )
elif i == 0:
return headers, body # no headers, just return those passed in.
else: # continuation
last, hdrlist = hdrlist[ -1 ], hdrlist[ :-1 ]
hdrlist.append( ( last[ 0 ]
, join( ( last[1], lstrip( line ) ), '\n' )
) )
i = i + 1
for hdr in hdrlist:
headers[ hdr[0] ] = hdr[ 1 ]
return headers, join( lines[ i+1: ], '\n' )