[Checkins] SVN: z3c.etestbrowser/trunk/src/z3c/etestbrowser/ Added
workaround for libxml2 HTML fallback behaviour that guesses
the wrong encoding.
Christian Theune
ct at gocept.com
Tue Sep 11 09:56:10 EDT 2007
Log message for revision 79569:
Added workaround for libxml2 HTML fallback behaviour that guesses the wrong
encoding.
Changed:
U z3c.etestbrowser/trunk/src/z3c/etestbrowser/README.txt
U z3c.etestbrowser/trunk/src/z3c/etestbrowser/ftesting.zcml
A z3c.etestbrowser/trunk/src/z3c/etestbrowser/lxml.pt
U z3c.etestbrowser/trunk/src/z3c/etestbrowser/testing.py
-=-
Modified: z3c.etestbrowser/trunk/src/z3c/etestbrowser/README.txt
===================================================================
--- z3c.etestbrowser/trunk/src/z3c/etestbrowser/README.txt 2007-09-11 12:59:52 UTC (rev 79568)
+++ z3c.etestbrowser/trunk/src/z3c/etestbrowser/README.txt 2007-09-11 13:56:09 UTC (rev 79569)
@@ -51,3 +51,15 @@
>>> browser.etree.xpath(
... '//html:body', {'html': 'http://www.w3.org/1999/xhtml'})
[<Element {http://www.w3.org/1999/xhtml}body at ...>]
+
+LXML unicode support
+====================
+
+Some variants of libxml2 might interpret UTF-8 encoded strings
+incorrectly. We have a workaround for that. Let's have a look at a view that
+contains a German umlaut:
+
+ >>> browser.xml_strict = False
+ >>> browser.open('http://localhost/lxml.html')
+ >>> browser.etree.xpath("//span")[0].text
+ u'K\xfcgelblitz.'
Modified: z3c.etestbrowser/trunk/src/z3c/etestbrowser/ftesting.zcml
===================================================================
--- z3c.etestbrowser/trunk/src/z3c/etestbrowser/ftesting.zcml 2007-09-11 12:59:52 UTC (rev 79568)
+++ z3c.etestbrowser/trunk/src/z3c/etestbrowser/ftesting.zcml 2007-09-11 13:56:09 UTC (rev 79569)
@@ -1,4 +1,6 @@
-<configure xmlns="http://namespaces.zope.org/zope">
+<configure xmlns="http://namespaces.zope.org/zope"
+ xmlns:browser="http://namespaces.zope.org/browser"
+ i18n_domain="zope">
<include package="zope.app.zcmlfiles" file="meta.zcml"/>
<include package="zope.app.zcmlfiles" />
@@ -19,5 +21,13 @@
description="All users have this role implicitly" />
<grant permission="zope.View"
- role="zope.Anonymous" />
+ role="zope.Anonymous" />
+
+ <browser:page
+ name="lxml.html"
+ for="*"
+ template="lxml.pt"
+ permission="zope.View"
+ />
+
</configure>
Added: z3c.etestbrowser/trunk/src/z3c/etestbrowser/lxml.pt
===================================================================
--- z3c.etestbrowser/trunk/src/z3c/etestbrowser/lxml.pt (rev 0)
+++ z3c.etestbrowser/trunk/src/z3c/etestbrowser/lxml.pt 2007-09-11 13:56:09 UTC (rev 79569)
@@ -0,0 +1,8 @@
+<html>
+ <head>
+ <meta http-equiv="content-type" content="text/html; charset=utf-8"/>
+ </head>
+ <body>
+ <span>Kügelblitz.</span>
+ </body>
+</html>
Property changes on: z3c.etestbrowser/trunk/src/z3c/etestbrowser/lxml.pt
___________________________________________________________________
Name: svn:eol-style
+ native
Modified: z3c.etestbrowser/trunk/src/z3c/etestbrowser/testing.py
===================================================================
--- z3c.etestbrowser/trunk/src/z3c/etestbrowser/testing.py 2007-09-11 12:59:52 UTC (rev 79568)
+++ z3c.etestbrowser/trunk/src/z3c/etestbrowser/testing.py 2007-09-11 13:56:09 UTC (rev 79569)
@@ -16,13 +16,14 @@
$Id$
"""
+import re
import StringIO
import lxml.etree
import zope.testbrowser.testing
-html_parser = lxml.etree.HTMLParser()
+RE_CHARSET = re.compile('.*;charset=(.*)')
class ExtendedTestBrowser(zope.testbrowser.testing.Browser):
@@ -45,10 +46,20 @@
# I'm not using any internal knowledge about testbrowser
# here, to avoid breakage. Memory usage won't be a problem.
if self.xml_strict:
- parser = None
+ self._etree = lxml.etree.XML(self.contents)
else:
- parser = html_parser
- self._etree = lxml.etree.XML(self.contents, parser)
+ # This is a workaround against the broken fallback for
+ # encoding detection of libxml2.
+ # We have a chance of knowing the encoding as Zope states this in
+ # the content-type response header.
+ content = self.contents
+ content_type = self.headers['content-type']
+ match = RE_CHARSET.match(content_type)
+ if match is not None:
+ charset = match.groups()[0]
+ content = content.decode(charset)
+ self._etree = lxml.etree.HTML(content)
+
return self._etree
def _changed(self):
More information about the Checkins
mailing list