[CMF-checkins] CVS: CMF/CMFDefault - utils.py:1.17
Yvo Schubbe
schubbe@web.de
Wed, 5 Feb 2003 12:51:41 -0500
Update of /cvs-repository/CMF/CMFDefault
In directory cvs.zope.org:/tmp/cvs-serv4163/CMFDefault
Modified Files:
utils.py
Log Message:
Merged yuppie-collector041-branch:
- Changed behavior of bodyfinder and html_headcheck.
- Fixed header stripping in edit and PUT. (Collector #41)
=== CMF/CMFDefault/utils.py 1.16 => 1.17 ===
--- CMF/CMFDefault/utils.py:1.16 Thu Dec 19 00:34:55 2002
+++ CMF/CMFDefault/utils.py Wed Feb 5 12:51:39 2003
@@ -1,6 +1,21 @@
+##############################################################################
+#
+# Copyright (c) 2001-2003 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+""" Utility functions.
+
+$Id$
"""
- Utility functions.
-"""
+
from sgmllib import SGMLParser
import re
import os
@@ -359,44 +374,38 @@
else:
return 1
-security.declarePrivate('_bodyre')
-_bodyre = re.compile( r'^\s*<html.*<body.*?>', re.DOTALL | re.I )
-
-security.declarePrivate('_endbodyre')
-_endbodyre = re.compile( r'</body', re.DOTALL | re.I )
-
security.declarePublic('bodyfinder')
-def bodyfinder( text ):
+def bodyfinder(text):
+ """ Return body or unchanged text if no body tags found.
- bod = _bodyre.search( text )
- if not bod:
+ Always use html_headcheck() first.
+ """
+ lowertext = text.lower()
+ bodystart = lowertext.find('<body')
+ if bodystart == -1:
return text
-
- end = _endbodyre.search( text )
- if not end:
+ bodystart = lowertext.find('>', bodystart) + 1
+ if bodystart == 0:
return text
- else:
- return text[bod.end():end.start()]
+ bodyend = lowertext.rfind('</body>', bodystart)
+ if bodyend == -1:
+ return text
+ return text[bodystart:bodyend]
security.declarePrivate('_htfinder')
-_htfinder = re.compile( r'<html', re.DOTALL | re.I )
+_htfinder = re.compile(r'(\s|(<[^<>]*?>))*<html.*<body.*?>.*</body>',
+ re.DOTALL)
security.declarePublic('html_headcheck')
-def html_headcheck( html ):
-
+def html_headcheck(html):
""" Return 'true' if document looks HTML-ish enough.
+
+ If true bodyfinder() will be able to find the HTML body.
"""
- if not _htfinder.search(html):
+ lowerhtml = html.lower()
+ if lowerhtml.find('<html') == -1:
+ return 0
+ elif _htfinder.match(lowerhtml):
+ return 1
+ else:
return 0
-
- lines = re.split(r'[\n\r]+?', html)
-
- for line in lines:
- line = line.strip()
-
- if not line:
- continue
- elif line.lower().startswith( '<html' ):
- return 1
- elif line[0] != '<':
- return 0