[Zope3-checkins] SVN: Zope3/branches/fdrake-anguenot_better_xml_support_for_pt/src/zope/tal/ Simple HTML Fragments encoding sniffer

Julien Anguenot ja at nuxeo.com
Sat Oct 15 01:03:49 EDT 2005


Log message for revision 39460:
  Simple HTML Fragments encoding sniffer

Changed:
  A   Zope3/branches/fdrake-anguenot_better_xml_support_for_pt/src/zope/tal/encodingsniffer.py
  A   Zope3/branches/fdrake-anguenot_better_xml_support_for_pt/src/zope/tal/tests/test_encodingsniffer.py

-=-
Added: Zope3/branches/fdrake-anguenot_better_xml_support_for_pt/src/zope/tal/encodingsniffer.py
===================================================================
--- Zope3/branches/fdrake-anguenot_better_xml_support_for_pt/src/zope/tal/encodingsniffer.py	2005-10-15 03:42:02 UTC (rev 39459)
+++ Zope3/branches/fdrake-anguenot_better_xml_support_for_pt/src/zope/tal/encodingsniffer.py	2005-10-15 05:03:48 UTC (rev 39460)
@@ -0,0 +1,58 @@
+##############################################################################
+#
+# Copyright (c) 2005 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""HTML fragment encoding sniffer
+
+$Id$
+"""
+ 
+from HTMLParser import HTMLParser 
+from HTMLParser import HTMLParseError
+
+class EncodingFound(Exception):
+    """Raised by the parser on a meta charset; `value` holds the charset."""
+    def __init__(self, value):
+        Exception.__init__(self, value)  # keep args/str() informative
+        self.value = value
+
+class EncodingParser(HTMLParser):
+    """HTML fragment parser that raises EncodingFound on a meta charset
+    """
+    def handle_starttag(self, tag, attrs):
+        # Called for every start tag.  When a meta tag's content attribute
+        # carries a charset parameter, raise EncodingFound holding its
+        # (unstripped) value.
+        if tag != 'meta':
+            return
+        for attr, value in attrs:
+            if attr != 'content' or not value:
+                continue
+            # Check every ';'-separated parameter, in any letter case,
+            # rather than assuming charset is the second one.
+            for param in value.split(';'):
+                pair = param.split('=', 1)
+                if len(pair) == 2 and pair[0].strip().lower() == 'charset':
+                    raise EncodingFound(pair[1])
+
+def sniff_encoding(data):
+    """Return the charset a meta tag in the HTML fragment `data` declares
+    (whitespace stripped), or '' if none is found or parsing fails."""
+    encoding = ''
+    sniffer = EncodingParser()
+    try:
+        sniffer.feed(data)
+    except HTMLParseError:
+        pass
+    except EncodingFound, found:
+        encoding = found.value.strip()
+    return encoding


Property changes on: Zope3/branches/fdrake-anguenot_better_xml_support_for_pt/src/zope/tal/encodingsniffer.py
___________________________________________________________________
Name: svn:keywords
   + Id

Added: Zope3/branches/fdrake-anguenot_better_xml_support_for_pt/src/zope/tal/tests/test_encodingsniffer.py
===================================================================
--- Zope3/branches/fdrake-anguenot_better_xml_support_for_pt/src/zope/tal/tests/test_encodingsniffer.py	2005-10-15 03:42:02 UTC (rev 39459)
+++ Zope3/branches/fdrake-anguenot_better_xml_support_for_pt/src/zope/tal/tests/test_encodingsniffer.py	2005-10-15 05:03:48 UTC (rev 39460)
@@ -0,0 +1,48 @@
+##############################################################################
+#
+# Copyright (c) 2005 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Test the HTML fragment encoding sniffer
+
+$Id$
+"""
+
+import sys
+import unittest
+
+from zope.tal.encodingsniffer import sniff_encoding
+
+class EncodingSnifferTestCase(unittest.TestCase):
+    """Check sniff_encoding() on fragments with and without a charset."""
+    def test_ascii_html_fragment_no_encoding(self):
+        fragment = """<p tal:content="python:u'déjà-vu'">para</p>"""
+        self.failIf(sniff_encoding(fragment))
+
+    def test_ascii_html_fragment_with_encoding(self):
+        fragment = """<meta http-equiv='Content-type' content='text/html; charset=ISO-8859-15'><p tal:content="python:u'déjà-vu'">para</p>"""
+        self.assertEqual(sniff_encoding(fragment), 'ISO-8859-15')
+
+    def test_ascii_html_fragment_with_encoding_and_whispace(self):
+        fragment = """<meta http-equiv='Content-type' content='text/html; charset= ISO-8859-15 '><p tal:content="python:u'déjà-vu'">para</p>"""
+        self.assertEqual(sniff_encoding(fragment), 'ISO-8859-15')
+
+    def test_ascii_html_fragment_with_encoding_and_ligne_break(self):
+        fragment = """<meta http-equiv='Content-type' content='text/html;\n charset= ISO-8859-15 '><p tal:content="python:u'déjà-vu'">para</p>"""
+        self.assertEqual(sniff_encoding(fragment), 'ISO-8859-15')
+
+def test_suite():
+    """Build the suite holding the encoding sniffer test case."""
+    sniffer_tests = unittest.makeSuite(EncodingSnifferTestCase)
+    return unittest.TestSuite((sniffer_tests,))
+
+if __name__ == '__main__':  # run the suite when executed as a script
+    unittest.main(defaultTest='test_suite')


Property changes on: Zope3/branches/fdrake-anguenot_better_xml_support_for_pt/src/zope/tal/tests/test_encodingsniffer.py
___________________________________________________________________
Name: svn:keywords
   + Id



More information about the Zope3-Checkins mailing list