[Zope3-checkins]
SVN: Zope3/branches/benji-testbrowser-with-real-browsers-take-2/src/
upgrade to BeautifulSoup 3 and change tests accordingly
(unicode everywhere)
Benji York
benji at zope.com
Mon Aug 14 09:33:14 EDT 2006
Log message for revision 69468:
upgrade to BeautifulSoup 3 and change tests accordingly (unicode everywhere)
Changed:
U Zope3/branches/benji-testbrowser-with-real-browsers-take-2/src/BeautifulSoup.py
U Zope3/branches/benji-testbrowser-with-real-browsers-take-2/src/zope/testbrowser/real/real.txt
-=-
Modified: Zope3/branches/benji-testbrowser-with-real-browsers-take-2/src/BeautifulSoup.py
===================================================================
--- Zope3/branches/benji-testbrowser-with-real-browsers-take-2/src/BeautifulSoup.py 2006-08-14 13:14:44 UTC (rev 69467)
+++ Zope3/branches/benji-testbrowser-with-real-browsers-take-2/src/BeautifulSoup.py 2006-08-14 13:33:13 UTC (rev 69468)
@@ -1,206 +1,304 @@
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
-v2.1.1
http://www.crummy.com/software/BeautifulSoup/
-Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance
-into a tree representation. It provides methods and Pythonic idioms
-that make it easy to search and modify the tree.
+Beautiful Soup parses a (possibly invalid) XML or HTML document into a
+tree representation. It provides methods and Pythonic idioms that make
+it easy to navigate, search, and modify the tree.
-A well-formed XML/HTML document will yield a well-formed data
-structure. An ill-formed XML/HTML document will yield a
-correspondingly ill-formed data structure. If your document is only
-locally well-formed, you can use this library to find and process the
-well-formed part of it. The BeautifulSoup class has heuristics for
-obtaining a sensible parse tree in the face of common HTML errors.
+A well-structured XML/HTML document yields a well-behaved data
+structure. An ill-structured XML/HTML document yields a
+correspondingly ill-behaved data structure. If your document is only
+locally well-structured, you can use this library to find and process
+the well-structured part of it.
-Beautiful Soup has no external dependencies. It works with Python 2.2
-and up.
+Beautiful Soup works with Python 2.2 and up. It has no external
+dependencies, but you'll have more success at converting data to UTF-8
+if you also install these three packages:
-Beautiful Soup defines classes for four different parsing strategies:
+* chardet, for auto-detecting character encodings
+ http://chardet.feedparser.org/
+* cjkcodecs and iconv_codec, which add more encodings to the ones supported
+ by stock Python.
+ http://cjkpython.i18n.org/
+Beautiful Soup defines classes for two main parsing strategies:
+
* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
language that kind of looks like XML.
* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
- or invalid.
+ or invalid. This class has web browser-like heuristics for
+ obtaining a sensible parse tree in the face of common HTML errors.
- * ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML
- that trips up BeautifulSoup.
+Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
+the encoding of an HTML or XML document, and converting it to
+Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed
+Parser.
- * BeautifulSOAP, for making it easier to parse XML documents that use
- lots of subelements containing a single string, where you'd prefer
- they put that string into an attribute (such as SOAP messages).
-
-You can subclass BeautifulStoneSoup or BeautifulSoup to create a
-parsing strategy specific to an XML schema or a particular bizarre
-HTML document. Typically your subclass would just override
-SELF_CLOSING_TAGS and/or NESTABLE_TAGS.
+For more than you ever wanted to know about Beautiful Soup, see the
+documentation:
+http://www.crummy.com/software/BeautifulSoup/documentation.html
"""
from __future__ import generators
-__author__ = "Leonard Richardson (leonardr at segfault.org)"
-__version__ = "2.1.1"
-__date__ = "$Date: 2004/10/18 00:14:20 $"
-__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson"
+__author__ = "Leonard Richardson (crummy.com)"
+__contributors__ = ["Sam Ruby (intertwingly.net)",
+ "the unwitting Mark Pilgrim (diveintomark.org)",
+ "http://www.crummy.com/software/BeautifulSoup/AUTHORS.html"]
+__version__ = "3.0.3"
+__copyright__ = "Copyright (c) 2004-2006 Leonard Richardson"
__license__ = "PSF"
from sgmllib import SGMLParser, SGMLParseError
+import codecs
import types
import re
import sgmllib
+from htmlentitydefs import name2codepoint
-#This code makes Beautiful Soup able to parse XML with namespaces
+# This RE makes Beautiful Soup able to parse XML with namespaces.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
-class NullType(object):
+# This RE makes Beautiful Soup capable of recognizing numeric character
+# references that use hexadecimal.
+sgmllib.charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')
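These two module-level patches are easy to sanity-check in isolation. A minimal sketch using the same patterns as standalone regexes (not the patched sgmllib module):

```python
import re

# The same patterns the patch installs into sgmllib.
tagfind = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*')
charref = re.compile(r'&#(\d+|x[0-9a-fA-F]+);')

# The ':' in the tag pattern is what makes namespaced tags parseable.
print(tagfind.match('dc:title').group())   # dc:title

# Decimal and hexadecimal character references both match now.
print(charref.match('&#169;').group(1))    # 169
print(charref.match('&#xA9;').group(1))    # xA9
```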
- """Similar to NoneType with a corresponding singleton instance
- 'Null' that, unlike None, accepts any message and returns itself.
+DEFAULT_OUTPUT_ENCODING = "utf-8"
- Examples:
- >>> Null("send", "a", "message")("and one more",
- ... "and what you get still") is Null
- True
- """
+# First, the classes that represent markup elements.
- def __new__(cls): return Null
- def __call__(self, *args, **kwargs): return Null
-## def __getstate__(self, *args): return Null
- def __getattr__(self, attr): return Null
- def __getitem__(self, item): return Null
- def __setattr__(self, attr, value): pass
- def __setitem__(self, item, value): pass
- def __len__(self): return 0
- # FIXME: is this a python bug? otherwise ``for x in Null: pass``
- # never terminates...
- def __iter__(self): return iter([])
- def __contains__(self, item): return False
- def __repr__(self): return "Null"
-Null = object.__new__(NullType)
-
class PageElement:
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
- def setup(self, parent=Null, previous=Null):
+ def setup(self, parent=None, previous=None):
"""Sets up the initial relations between this element and
- other elements."""
+ other elements."""
self.parent = parent
self.previous = previous
- self.next = Null
- self.previousSibling = Null
- self.nextSibling = Null
+ self.next = None
+ self.previousSibling = None
+ self.nextSibling = None
if self.parent and self.parent.contents:
self.previousSibling = self.parent.contents[-1]
self.previousSibling.nextSibling = self
- def findNext(self, name=None, attrs={}, text=None):
+ def replaceWith(self, replaceWith):
+ oldParent = self.parent
+ myIndex = self.parent.contents.index(self)
+ if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
+ # We're replacing this element with one of its siblings.
+ index = self.parent.contents.index(replaceWith)
+ if index and index < myIndex:
+ # Furthermore, it comes before this element. That
+ # means that when we extract it, the index of this
+ # element will change.
+ myIndex = myIndex - 1
+ self.extract()
+ oldParent.insert(myIndex, replaceWith)
+
+ def extract(self):
+ """Destructively rips this element out of the tree."""
+ if self.parent:
+ try:
+ self.parent.contents.remove(self)
+ except ValueError:
+ pass
+
+ #Find the two elements that would be next to each other if
+ #this element (and any children) hadn't been parsed. Connect
+ #the two.
+ lastChild = self._lastRecursiveChild()
+ nextElement = lastChild.next
+
+ if self.previous:
+ self.previous.next = nextElement
+ if nextElement:
+ nextElement.previous = self.previous
+ self.previous = None
+ lastChild.next = None
+
+ self.parent = None
+ if self.previousSibling:
+ self.previousSibling.nextSibling = self.nextSibling
+ if self.nextSibling:
+ self.nextSibling.previousSibling = self.previousSibling
+ self.previousSibling = self.nextSibling = None
+
+ def _lastRecursiveChild(self):
+ "Finds the last element beneath this object to be parsed."
+ lastChild = self
+ while hasattr(lastChild, 'contents') and lastChild.contents:
+ lastChild = lastChild.contents[-1]
+ return lastChild
+
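The pointer surgery in extract() is easier to see on a toy structure. A sketch with an illustrative Node class whose attribute names mirror the patch, but which is not the real PageElement:

```python
class Node(object):
    """Toy stand-in for PageElement: linked in document order via
    previous/next, plus parent/contents and sibling links."""
    def __init__(self, name):
        self.name = name
        self.contents = []
        self.parent = self.previous = self.next = None
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        last = self
        while last.contents:
            last = last.contents[-1]
        return last

    def extract(self):
        # 1. Leave the parent's child list.
        if self.parent:
            self.parent.contents.remove(self)
        # 2. Splice the document-order chain around this subtree.
        last = self._lastRecursiveChild()
        nextElement = last.next
        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = last.next = None
        # 3. Re-link the siblings to each other.
        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

# <root><a/><b/><c/></root>, wired up by hand:
root, a, b, c = Node('root'), Node('a'), Node('b'), Node('c')
root.contents = [a, b, c]
a.parent = b.parent = c.parent = root
root.next = a; a.previous = root
a.next = b;    b.previous = a
b.next = c;    c.previous = b
a.nextSibling = b; b.previousSibling = a
b.nextSibling = c; c.previousSibling = b

b.extract()
print(a.next.name, a.nextSibling.name)  # c c
```

After the extraction, both the document-order chain and the sibling chain skip straight from a to c, and b carries no references back into the tree, which is what lets extracted chunks be garbage-collected.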
+ def insert(self, position, newChild):
+ if (isinstance(newChild, basestring)
+ or isinstance(newChild, unicode)) \
+ and not isinstance(newChild, NavigableString):
+ newChild = NavigableString(newChild)
+
+ position = min(position, len(self.contents))
+ if hasattr(newChild, 'parent') and newChild.parent != None:
+ # We're 'inserting' an element that's already one
+ # of this object's children.
+ if newChild.parent == self:
+ index = self.find(newChild)
+ if index and index < position:
+ # Furthermore we're moving it further down the
+ # list of this object's children. That means that
+ # when we extract this element, our target index
+ # will jump down one.
+ position = position - 1
+ newChild.extract()
+
+ newChild.parent = self
+ previousChild = None
+ if position == 0:
+ newChild.previousSibling = None
+ newChild.previous = self
+ else:
+ previousChild = self.contents[position-1]
+ newChild.previousSibling = previousChild
+ newChild.previousSibling.nextSibling = newChild
+ newChild.previous = previousChild._lastRecursiveChild()
+ if newChild.previous:
+ newChild.previous.next = newChild
+
+ newChildsLastElement = newChild._lastRecursiveChild()
+
+ if position >= len(self.contents):
+ newChild.nextSibling = None
+
+ parent = self
+ parentsNextSibling = None
+ while not parentsNextSibling:
+ parentsNextSibling = parent.nextSibling
+ parent = parent.parent
+ if not parent: # This is the last element in the document.
+ break
+ if parentsNextSibling:
+ newChildsLastElement.next = parentsNextSibling
+ else:
+ newChildsLastElement.next = None
+ else:
+ nextChild = self.contents[position]
+ newChild.nextSibling = nextChild
+ if newChild.nextSibling:
+ newChild.nextSibling.previousSibling = newChild
+ newChildsLastElement.next = nextChild
+
+ if newChildsLastElement.next:
+ newChildsLastElement.next.previous = newChildsLastElement
+ self.contents.insert(position, newChild)
+
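The index bookkeeping in insert() (and in replaceWith() above) comes down to one observation: extracting an element that sits before the target slot shifts that slot down by one. In plain-list terms:

```python
contents = ['a', 'b', 'c']

# Move 'a' so it ends up just before 'c' (target slot 2).
target = 2
source = contents.index('a')
contents.remove('a')      # the extract step
if source < target:
    target -= 1           # slots after the source slid down by one
contents.insert(target, 'a')
print(contents)           # ['b', 'a', 'c']
```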
+ def findNext(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the first item that matches the given criteria and
appears after this Tag in the document."""
- return self._first(self.fetchNext, name, attrs, text)
- firstNext = findNext
+ return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
- def fetchNext(self, name=None, attrs={}, text=None, limit=None):
+ def findAllNext(self, name=None, attrs={}, text=None, limit=None,
+ **kwargs):
"""Returns all items that match the given criteria and appear
after this Tag in the document."""
- return self._fetch(name, attrs, text, limit, self.nextGenerator)
+ return self._findAll(name, attrs, text, limit, self.nextGenerator)
- def findNextSibling(self, name=None, attrs={}, text=None):
+ def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the closest sibling to this Tag that matches the
given criteria and appears after this Tag in the document."""
- return self._first(self.fetchNextSiblings, name, attrs, text)
- firstNextSibling = findNextSibling
+ return self._findOne(self.findNextSiblings, name, attrs, text,
+ **kwargs)
- def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None):
+ def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
+ **kwargs):
"""Returns the siblings of this Tag that match the given
criteria and appear after this Tag in the document."""
- return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator)
+ return self._findAll(name, attrs, text, limit,
+ self.nextSiblingGenerator, **kwargs)
+ fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
- def findPrevious(self, name=None, attrs={}, text=None):
+ def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the first item that matches the given criteria and
appears before this Tag in the document."""
- return self._first(self.fetchPrevious, name, attrs, text)
+ return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
- def fetchPrevious(self, name=None, attrs={}, text=None, limit=None):
+ def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
+ **kwargs):
"""Returns all items that match the given criteria and appear
before this Tag in the document."""
- return self._fetch(name, attrs, text, limit, self.previousGenerator)
- firstPrevious = findPrevious
+ return self._findAll(name, attrs, text, limit, self.previousGenerator,
+ **kwargs)
+ fetchPrevious = findAllPrevious # Compatibility with pre-3.x
- def findPreviousSibling(self, name=None, attrs={}, text=None):
+ def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the closest sibling to this Tag that matches the
given criteria and appears before this Tag in the document."""
- return self._first(self.fetchPreviousSiblings, name, attrs, text)
- firstPreviousSibling = findPreviousSibling
+ return self._findOne(self.findPreviousSiblings, name, attrs, text,
+ **kwargs)
- def fetchPreviousSiblings(self, name=None, attrs={}, text=None,
- limit=None):
+ def findPreviousSiblings(self, name=None, attrs={}, text=None,
+ limit=None, **kwargs):
"""Returns the siblings of this Tag that match the given
criteria and appear before this Tag in the document."""
- return self._fetch(name, attrs, text, limit,
- self.previousSiblingGenerator)
+ return self._findAll(name, attrs, text, limit,
+ self.previousSiblingGenerator, **kwargs)
+ fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
- def findParent(self, name=None, attrs={}):
+ def findParent(self, name=None, attrs={}, **kwargs):
"""Returns the closest parent of this Tag that matches the given
criteria."""
- r = Null
- l = self.fetchParents(name, attrs, 1)
+ # NOTE: We can't use _findOne because findParents takes a different
+ # set of arguments.
+ r = None
+ l = self.findParents(name, attrs, 1)
if l:
r = l[0]
return r
- firstParent = findParent
- def fetchParents(self, name=None, attrs={}, limit=None):
+ def findParents(self, name=None, attrs={}, limit=None, **kwargs):
"""Returns the parents of this Tag that match the given
criteria."""
- return self._fetch(name, attrs, None, limit, self.parentGenerator)
+ return self._findAll(name, attrs, None, limit, self.parentGenerator,
+ **kwargs)
+ fetchParents = findParents # Compatibility with pre-3.x
+
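The pre-3.x names are kept alive with plain class-level aliases, so old callers hit the very same function object. The pattern, reduced to essentials (Finder is an illustrative class, not part of the patch):

```python
class Finder(object):
    def findParents(self):
        return ['parent']
    fetchParents = findParents  # old 2.x name, same function object

f = Finder()
print(f.fetchParents())  # ['parent']
print(Finder.__dict__['fetchParents'] is Finder.__dict__['findParents'])  # True
```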
#These methods do the real heavy lifting.
- def _first(self, method, name, attrs, text):
- r = Null
- l = method(name, attrs, text, 1)
+ def _findOne(self, method, name, attrs, text, **kwargs):
+ r = None
+ l = method(name, attrs, text, 1, **kwargs)
if l:
r = l[0]
return r
- def _fetch(self, name, attrs, text, limit, generator):
+ def _findAll(self, name, attrs, text, limit, generator, **kwargs):
"Iterates over a generator looking for things that match."
- if not hasattr(attrs, 'items'):
- attrs = {'class' : attrs}
- results = []
+ if isinstance(name, SoupStrainer):
+ strainer = name
+ else:
+ # Build a SoupStrainer
+ strainer = SoupStrainer(name, attrs, text, **kwargs)
+ results = ResultSet(strainer)
g = generator()
while True:
try:
i = g.next()
except StopIteration:
break
- found = None
- if isinstance(i, Tag):
- if not text:
- if not name or self._matches(i, name):
- match = True
- for attr, matchAgainst in attrs.items():
- check = i.get(attr)
- if not self._matches(check, matchAgainst):
- match = False
- break
- if match:
- found = i
- elif text:
- if self._matches(i, text):
- found = i
- if found:
- results.append(found)
- if limit and len(results) >= limit:
- break
+ if i:
+ found = strainer.search(i)
+ if found:
+ results.append(found)
+ if limit and len(results) >= limit:
+ break
return results
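Stripped of the SoupStrainer details, _findAll is a generic "walk a generator, keep matches, stop at the limit" loop. A sketch of that shape (a for-loop stands in for the explicit g.next()/StopIteration handling above):

```python
def find_all(generator, matches, limit=None):
    """Collect items from `generator` that satisfy `matches`,
    stopping once `limit` results have been found."""
    results = []
    for item in generator:
        if item is not None and matches(item):
            results.append(item)
            if limit and len(results) >= limit:
                break
    return results

evens = find_all(iter(range(10)), lambda n: n % 2 == 0, limit=3)
print(evens)  # [0, 2, 4]
```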
- #Generators that can be used to navigate starting from both
- #NavigableTexts and Tags.
+ #These Generators can be used to navigate starting from both
+ #NavigableStrings and Tags.
def nextGenerator(self):
i = self
while i:
@@ -231,56 +329,91 @@
i = i.parent
yield i
- def _matches(self, chunk, howToMatch):
- #print 'looking for %s in %s' % (howToMatch, chunk)
- #
- # If given a list of items, return true if the list contains a
- # text element that matches.
- if isList(chunk) and not isinstance(chunk, Tag):
- for tag in chunk:
- if isinstance(tag, NavigableText) and self._matches(tag, howToMatch):
- return True
- return False
- if callable(howToMatch):
- return howToMatch(chunk)
- if isinstance(chunk, Tag):
- #Custom match methods take the tag as an argument, but all other
- #ways of matching match the tag name as a string
- chunk = chunk.name
- #Now we know that chunk is a string
- if not isinstance(chunk, basestring):
- chunk = str(chunk)
- if hasattr(howToMatch, 'match'):
- # It's a regexp object.
- return howToMatch.search(chunk)
- if isList(howToMatch):
- return chunk in howToMatch
- if hasattr(howToMatch, 'items'):
- return howToMatch.has_key(chunk)
- #It's just a string
- return str(howToMatch) == chunk
+ # Utility methods
+ def substituteEncoding(self, str, encoding=None):
+ encoding = encoding or "utf-8"
+ return str.replace("%SOUP-ENCODING%", encoding)
-class NavigableText(PageElement):
+ def toEncoding(self, s, encoding=None):
+ """Encodes an object to a string in some encoding, or to Unicode.
+ ."""
+ if isinstance(s, unicode):
+ if encoding:
+ s = s.encode(encoding)
+ elif isinstance(s, str):
+ if encoding:
+ s = s.encode(encoding)
+ else:
+ s = unicode(s)
+ else:
+ if encoding:
+ s = self.toEncoding(str(s), encoding)
+ else:
+ s = unicode(s)
+ return s
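toEncoding centralizes the commit's "unicode everywhere" rule: text stays Unicode inside the tree and is only encoded at the output boundary. The underlying round trip, in two lines:

```python
text = u'caf\xe9'                # Unicode inside the tree
encoded = text.encode('utf-8')   # bytes only at output time

print(encoded == b'caf\xc3\xa9')         # True
print(encoded.decode('utf-8') == text)   # True
```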
+class NavigableString(unicode, PageElement):
+
def __getattr__(self, attr):
- "For backwards compatibility, text.string gives you text"
+ """text.string gives you text. This is for backwards
+ compatibility for Navigable*String, but for CData* it lets you
+ get the string without the CData wrapper."""
if attr == 'string':
return self
else:
raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
+
+ def __unicode__(self):
+ return self.__str__(None)
+
+ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ if encoding:
+ return self.encode(encoding)
+ else:
+ return self
-class NavigableString(str, NavigableText):
- pass
+class CData(NavigableString):
-class NavigableUnicodeString(unicode, NavigableText):
- pass
+ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
+class ProcessingInstruction(NavigableString):
+ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ output = self
+ if "%SOUP-ENCODING%" in output:
+ output = self.substituteEncoding(output, encoding)
+ return "<?%s?>" % self.toEncoding(output, encoding)
+
+class Comment(NavigableString):
+ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ return "<!--%s-->" % NavigableString.__str__(self, encoding)
+
+class Declaration(NavigableString):
+ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ return "<!%s>" % NavigableString.__str__(self, encoding)
+
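Because NavigableString now subclasses unicode, each wrapper class above only has to override serialization; the payload really is a string. A toy analogue of the idea (using str for portability, with an illustrative render method rather than the real __str__ machinery):

```python
class NavString(str):
    def render(self):
        return str(self)

class Comment(NavString):
    def render(self):
        return "<!--%s-->" % str(self)

class CData(NavString):
    def render(self):
        return "<![CDATA[%s]]>" % str(self)

c = Comment("draft copy")
print(c.upper())                 # DRAFT COPY -- still an ordinary string
print(c.render())                # <!--draft copy-->
print(CData("a < b").render())   # <![CDATA[a < b]]>
```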
class Tag(PageElement):
-
"""Represents a found HTML tag with its attributes and contents."""
- def __init__(self, name, attrs=None, parent=Null, previous=Null):
+ XML_ENTITIES_TO_CHARS = { 'apos' : "'",
+ "quot" : '"',
+ "amp" : "&",
+ "lt" : "<",
+ "gt" : ">"
+ }
+ # An RE for finding ampersands that aren't the start of a
+ # numeric entity.
+ BARE_AMPERSAND = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
+
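The negative lookahead in BARE_AMPERSAND is what keeps already-escaped text from being escaped twice. A quick standalone check of the same pattern:

```python
import re

BARE_AMPERSAND = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")

# Only the ampersand that is not already part of an entity gets escaped.
print(BARE_AMPERSAND.sub("&amp;", "fish & chips &amp; &#38; salt"))
# fish &amp; chips &amp; &#38; salt
```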
+ def __init__(self, parser, name, attrs=None, parent=None,
+ previous=None):
"Basic constructor."
+
+ # We don't actually store the parser object: that lets extracted
+ # chunks be garbage-collected
+ self.parserClass = parser.__class__
+ self.isSelfClosing = parser.isSelfClosingTag(name)
+ self.convertHTMLEntities = parser.convertHTMLEntities
self.name = name
if attrs == None:
attrs = []
@@ -288,6 +421,7 @@
self.contents = []
self.setup(parent, previous)
self.hidden = False
+ self.containsSubstitutions = False
def get(self, key, default=None):
"""Returns the value of the 'key' attribute for the tag, or
@@ -295,6 +429,9 @@
attribute."""
return self._getAttrMap().get(key, default)
+ def has_key(self, key):
+ return self._getAttrMap().has_key(key)
+
def __getitem__(self, key):
"""tag[key] returns the value of the 'key' attribute for the tag,
and throws an exception if it's not there."""
@@ -342,15 +479,16 @@
def __call__(self, *args, **kwargs):
"""Calling a tag like a function is the same as calling its
- fetch() method. Eg. tag('a') returns a list of all the A tags
+ findAll() method. Eg. tag('a') returns a list of all the A tags
found within this tag."""
- return apply(self.fetch, args, kwargs)
+ return apply(self.findAll, args, kwargs)
def __getattr__(self, tag):
+ #print "Getattr %s.%s" % (self.__class__, tag)
if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
- return self.first(tag[:-3])
+ return self.find(tag[:-3])
elif tag.find('__') != 0:
- return self.first(tag)
+ return self.find(tag)
def __eq__(self, other):
"""Returns true iff this tag has the same name, the same attributes,
@@ -370,39 +508,91 @@
as defined in __eq__."""
return not self == other
- def __repr__(self):
+ def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
"""Renders this tag as a string."""
- return str(self)
+ return self.__str__(encoding)
def __unicode__(self):
- return self.__str__(1)
+ return self.__str__(None)
- def __str__(self, needUnicode=None, showStructureIndent=None):
+ def _convertEntities(self, match):
+ x = match.group(1)
+ if x in name2codepoint:
+ return unichr(name2codepoint[x])
+ elif x in self.XML_ENTITIES_TO_CHARS:
+ return '&%s;' % x
+ else:
+ return '&amp;%s;' % x
+
+ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ prettyPrint=False, indentLevel=0):
"""Returns a string or Unicode representation of this tag and
- its contents.
+ its contents. To get Unicode, pass None for encoding.
NOTE: since Python's HTML parser consumes whitespace, this
method is not certain to reproduce the whitespace present in
the original string."""
-
+
+ encodedName = self.toEncoding(self.name, encoding)
+
attrs = []
if self.attrs:
for key, val in self.attrs:
- attrs.append('%s="%s"' % (key, val))
+ fmt = '%s="%s"'
+ if isString(val):
+ if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
+ val = self.substituteEncoding(val, encoding)
+
+ # The attribute value either:
+ #
+ # * Contains no embedded double quotes or single quotes.
+ # No problem: we enclose it in double quotes.
+ # * Contains embedded single quotes. No problem:
+ # double quotes work here too.
+ # * Contains embedded double quotes. No problem:
+ # we enclose it in single quotes.
+ # * Embeds both single _and_ double quotes. This
+ # can't happen naturally, but it can happen if
+ # you modify an attribute value after parsing
+ # the document. Now we have a bit of a
+ # problem. We solve it by enclosing the
+ # attribute in single quotes, and escaping any
+ # embedded single quotes to XML entities.
+ if '"' in val:
+ # This can't happen naturally, but it can happen
+ # if you modify an attribute value after parsing.
+ if "'" in val:
+ val = val.replace('"', """)
+ else:
+ fmt = "%s='%s'"
+
+ # Optionally convert any HTML entities
+ if self.convertHTMLEntities:
+ val = re.sub("&(\w+);", self._convertEntities, val)
+
+ # Now we're okay w/r/t quotes. But the attribute
+ # value might also contain angle brackets, or
+ # ampersands that aren't part of entities. We need
+ # to escape those to XML entities too.
+ val = val.replace("<", "<").replace(">", ">")
+ val = self.BARE_AMPERSAND.sub("&", val)
+
+
+ attrs.append(fmt % (self.toEncoding(key, encoding),
+ self.toEncoding(val, encoding)))
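The quoting rules spelled out in the comment above condense to a few lines. A sketch with a hypothetical render_attr helper (not part of the patch), covering all three quote situations:

```python
def render_attr(key, val):
    fmt = '%s="%s"'
    if '"' in val:
        if "'" in val:
            # Both quote styles present: keep double quotes and
            # escape the embedded double quotes as entities.
            val = val.replace('"', "&quot;")
        else:
            # Embedded double quotes only: switch to single quotes.
            fmt = "%s='%s'"
    return fmt % (key, val)

print(render_attr('alt', 'plain'))          # alt="plain"
print(render_attr('alt', 'say "cheese"'))   # alt='say "cheese"'
print(render_attr('alt', 'it\'s "so"'))     # alt="it's &quot;so&quot;"
```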
close = ''
closeTag = ''
- if self.isSelfClosing():
+ if self.isSelfClosing:
close = ' /'
else:
- closeTag = '</%s>' % self.name
- indentIncrement = None
- if showStructureIndent != None:
- indentIncrement = showStructureIndent
- if not self.hidden:
- indentIncrement += 1
- contents = self.renderContents(indentIncrement, needUnicode=needUnicode)
- if showStructureIndent:
- space = '\n%s' % (' ' * showStructureIndent)
+ closeTag = '</%s>' % encodedName
+
+ indentTag, indentContents = 0, 0
+ if prettyPrint:
+ indentTag = indentLevel
+ space = (' ' * (indentTag-1))
+ indentContents = indentTag + 1
+ contents = self.renderContents(encoding, prettyPrint, indentContents)
if self.hidden:
s = contents
else:
@@ -410,73 +600,61 @@
attributeString = ''
if attrs:
attributeString = ' ' + ' '.join(attrs)
- if showStructureIndent:
+ if prettyPrint:
s.append(space)
- s.append('<%s%s%s>' % (self.name, attributeString, close))
+ s.append('<%s%s%s>' % (encodedName, attributeString, close))
+ if prettyPrint:
+ s.append("\n")
s.append(contents)
- if closeTag and showStructureIndent != None:
+ if prettyPrint and contents and contents[-1] != "\n":
+ s.append("\n")
+ if prettyPrint and closeTag:
s.append(space)
s.append(closeTag)
+ if prettyPrint and closeTag and self.nextSibling:
+ s.append("\n")
s = ''.join(s)
- isUnicode = type(s) == types.UnicodeType
- if needUnicode and not isUnicode:
- s = unicode(s)
- elif isUnicode and needUnicode==False:
- s = str(s)
return s
- def prettify(self, needUnicode=None):
- return self.__str__(needUnicode, showStructureIndent=True)
+ def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ return self.__str__(encoding, True)
- def renderContents(self, showStructureIndent=None, needUnicode=None):
- """Renders the contents of this tag as a (possibly Unicode)
- string."""
+ def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ prettyPrint=False, indentLevel=0):
+ """Renders the contents of this tag as a string in the given
+ encoding. If encoding is None, returns a Unicode string."""
s=[]
for c in self:
text = None
- if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType:
- text = unicode(c)
+ if isinstance(c, NavigableString):
+ text = c.__str__(encoding)
elif isinstance(c, Tag):
- s.append(c.__str__(needUnicode, showStructureIndent))
- elif needUnicode:
- text = unicode(c)
- else:
- text = str(c)
+ s.append(c.__str__(encoding, prettyPrint, indentLevel))
+ if text and prettyPrint:
+ text = text.strip()
if text:
- if showStructureIndent != None:
- if text[-1] == '\n':
- text = text[:-1]
+ if prettyPrint:
+ s.append(" " * (indentLevel-1))
s.append(text)
+ if prettyPrint:
+ s.append("\n")
return ''.join(s)
#Soup methods
- def firstText(self, text, recursive=True):
- """Convenience method to retrieve the first piece of text matching the
- given criteria. 'text' can be a string, a regular expression object,
- a callable that takes a string and returns whether or not the
- string 'matches', etc."""
- return self.first(recursive=recursive, text=text)
-
- def fetchText(self, text, recursive=True, limit=None):
- """Convenience method to retrieve all pieces of text matching the
- given criteria. 'text' can be a string, a regular expression object,
- a callable that takes a string and returns whether or not the
- string 'matches', etc."""
- return self.fetch(recursive=recursive, text=text, limit=limit)
-
- def first(self, name=None, attrs={}, recursive=True, text=None):
- """Return only the first child of this
- Tag matching the given criteria."""
- r = Null
- l = self.fetch(name, attrs, recursive, text, 1)
+ def find(self, name=None, attrs={}, recursive=True, text=None,
+ **kwargs):
+ """Return only the first child of this Tag matching the given
+ criteria."""
+ r = None
+ l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
if l:
r = l[0]
return r
- findChild = first
+ findChild = find
- def fetch(self, name=None, attrs={}, recursive=True, text=None,
- limit=None):
+ def findAll(self, name=None, attrs={}, recursive=True, text=None,
+ limit=None, **kwargs):
"""Extracts a list of Tag objects that match the given
criteria. You can specify the name of the Tag and any
attributes you want the Tag to have.
@@ -489,19 +667,21 @@
generator = self.recursiveChildGenerator
if not recursive:
generator = self.childGenerator
- return self._fetch(name, attrs, text, limit, generator)
- fetchChildren = fetch
+ return self._findAll(name, attrs, text, limit, generator, **kwargs)
+ findChildren = findAll
+
+ # Pre-3.x compatibility methods
+ first = find
+ fetch = findAll
+ def fetchText(self, text=None, recursive=True, limit=None):
+ return self.findAll(text=text, recursive=recursive, limit=limit)
+
+ def firstText(self, text=None, recursive=True):
+ return self.find(text=text, recursive=recursive)
+
#Utility methods
- def isSelfClosing(self):
- """Returns true iff this is a self-closing tag as defined in the HTML
- standard.
-
- TODO: This is specific to BeautifulSoup and its subclasses, but it's
- used by __str__"""
- return self.name in BeautifulSoup.SELF_CLOSING_TAGS
-
def append(self, tag):
"""Appends the given tag to the contents of this tag."""
self.contents.append(tag)
@@ -538,17 +718,153 @@
break
raise StopIteration
+# Next, a couple classes to represent queries and their results.
+class SoupStrainer:
+ """Encapsulates a number of ways of matching a markup element (tag or
+ text)."""
+ def __init__(self, name=None, attrs={}, text=None, **kwargs):
+ self.name = name
+ if isString(attrs):
+ kwargs['class'] = attrs
+ attrs = None
+ if kwargs:
+ if attrs:
+ attrs = attrs.copy()
+ attrs.update(kwargs)
+ else:
+ attrs = kwargs
+ self.attrs = attrs
+ self.text = text
+
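The constructor above lets `attrs` be a bare string (shorthand for the class attribute) and folds extra keyword arguments into the attribute map. Reduced to a standalone function (the name build_attr_query is illustrative):

```python
def build_attr_query(attrs={}, **kwargs):
    if isinstance(attrs, str):
        kwargs['class'] = attrs   # the soup('div', 'header') shorthand
        attrs = None
    if kwargs:
        if attrs:
            attrs = dict(attrs)   # copy, then merge the keyword args in
            attrs.update(kwargs)
        else:
            attrs = kwargs
    return attrs

print(build_attr_query('header'))
# {'class': 'header'}
print(build_attr_query({'id': 'x'}, align='left'))
# {'id': 'x', 'align': 'left'}
```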
+ def __str__(self):
+ if self.text:
+ return self.text
+ else:
+ return "%s|%s" % (self.name, self.attrs)
+
+ def searchTag(self, markupName=None, markupAttrs={}):
+ found = None
+ markup = None
+ if isinstance(markupName, Tag):
+ markup = markupName
+ markupAttrs = markup
+ callFunctionWithTagData = callable(self.name) \
+ and not isinstance(markupName, Tag)
+
+ if (not self.name) \
+ or callFunctionWithTagData \
+ or (markup and self._matches(markup, self.name)) \
+ or (not markup and self._matches(markupName, self.name)):
+ if callFunctionWithTagData:
+ match = self.name(markupName, markupAttrs)
+ else:
+ match = True
+ markupAttrMap = None
+ for attr, matchAgainst in self.attrs.items():
+ if not markupAttrMap:
+ if hasattr(markupAttrs, 'get'):
+ markupAttrMap = markupAttrs
+ else:
+ markupAttrMap = {}
+ for k,v in markupAttrs:
+ markupAttrMap[k] = v
+ attrValue = markupAttrMap.get(attr)
+ if not self._matches(attrValue, matchAgainst):
+ match = False
+ break
+ if match:
+ if markup:
+ found = markup
+ else:
+ found = markupName
+ return found
+
+ def search(self, markup):
+ #print 'looking for %s in %s' % (self, markup)
+ found = None
+ # If given a list of items, scan it for a text element that
+ # matches.
+ if isList(markup) and not isinstance(markup, Tag):
+ for element in markup:
+ if isinstance(element, NavigableString) \
+ and self.search(element):
+ found = element
+ break
+ # If it's a Tag, make sure its name or attributes match.
+ # Don't bother with Tags if we're searching for text.
+ elif isinstance(markup, Tag):
+ if not self.text:
+ found = self.searchTag(markup)
+ # If it's text, make sure the text matches.
+ elif isinstance(markup, NavigableString) or \
+ isString(markup):
+ if self._matches(markup, self.text):
+ found = markup
+ else:
+ raise Exception, "I don't know how to match against a %s" \
+ % markup.__class__
+ return found
+
+ def _matches(self, markup, matchAgainst):
+ #print "Matching %s against %s" % (markup, matchAgainst)
+ result = False
+ if matchAgainst == True and type(matchAgainst) == types.BooleanType:
+ result = markup != None
+ elif callable(matchAgainst):
+ result = matchAgainst(markup)
+ else:
+ #Custom match methods take the tag as an argument, but all
+ #other ways of matching match the tag name as a string.
+ if isinstance(markup, Tag):
+ markup = markup.name
+ if markup and not isString(markup):
+ markup = unicode(markup)
+ #Now we know that chunk is either a string, or None.
+ if hasattr(matchAgainst, 'match'):
+ # It's a regexp object.
+ result = markup and matchAgainst.search(markup)
+ elif isList(matchAgainst):
+ result = markup in matchAgainst
+ elif hasattr(matchAgainst, 'items'):
+ result = markup.has_key(matchAgainst)
+ elif matchAgainst and isString(markup):
+ if isinstance(markup, unicode):
+ matchAgainst = unicode(matchAgainst)
+ else:
+ matchAgainst = str(matchAgainst)
+
+ if not result:
+ result = matchAgainst == markup
+ return result
+
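The matching rules implemented by `_matches` above can be sketched standalone. This is an illustrative Python 3 paraphrase, not the committed Python 2 code:

```python
import re

def matches(markup, match_against):
    """Illustrative Python 3 paraphrase of SoupStrainer._matches: the
    criterion may be True (match anything non-None), a callable, a
    compiled regex, a list/tuple of names, or a plain string."""
    if match_against is True:
        return markup is not None
    if callable(match_against):
        return match_against(markup)
    if hasattr(match_against, 'match'):        # compiled regex object
        return bool(markup and match_against.search(markup))
    if isinstance(match_against, (list, tuple)):
        return markup in match_against
    return match_against == markup
```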
+class ResultSet(list):
+ """A ResultSet is just a list that keeps track of the SoupStrainer
+ that created it."""
+ def __init__(self, source):
+ list.__init__([])
+ self.source = source
+
+# Now, some helper functions.
+
def isList(l):
"""Convenience method that works with all 2.x versions of Python
to determine whether or not something is listlike."""
return hasattr(l, '__iter__') \
or (type(l) in (types.ListType, types.TupleType))
+def isString(s):
+ """Convenience method that works with all 2.x versions of Python
+ to determine whether or not something is stringlike."""
+ try:
+ return isinstance(s, unicode) or isinstance(s, basestring)
+ except NameError:
+ return isinstance(s, str)
+
def buildTagMap(default, *args):
"""Turns a list of maps, lists, or scalars into a single map.
- Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
- of lists and partial maps."""
+ Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
+ NESTING_RESET_TAGS maps out of lists and partial maps."""
built = {}
for portion in args:
if hasattr(portion, 'items'):
@@ -564,9 +880,11 @@
built[portion] = default
return built
+# Now, the parser classes.
+
class BeautifulStoneSoup(Tag, SGMLParser):
- """This class contains the basic parser and fetch code. It defines
+ """This class contains the basic parser and search code. It defines
a parser that knows nothing about tag behavior except for the
following:
@@ -587,97 +905,108 @@
RESET_NESTING_TAGS = {}
QUOTE_TAGS = {}
- #As a public service we will by default silently replace MS smart quotes
- #and similar characters with their HTML or ASCII equivalents.
- MS_CHARS = { '\x80' : '&euro;',
- '\x81' : ' ',
- '\x82' : '&sbquo;',
- '\x83' : '&fnof;',
- '\x84' : '&bdquo;',
- '\x85' : '&hellip;',
- '\x86' : '&dagger;',
- '\x87' : '&Dagger;',
- '\x88' : '&#x2041;',
- '\x89' : '%',
- '\x8A' : '&Scaron;',
- '\x8B' : '<',
- '\x8C' : '&OElig;',
- '\x8D' : '?',
- '\x8E' : 'Z',
- '\x8F' : '?',
- '\x90' : '?',
- '\x91' : '&lsquo;',
- '\x92' : '&rsquo;',
- '\x93' : '&ldquo;',
- '\x94' : '&rdquo;',
- '\x95' : '&bull;',
- '\x96' : '&ndash;',
- '\x97' : '&mdash;',
- '\x98' : '&tilde;',
- '\x99' : '&trade;',
- '\x9a' : '&scaron;',
- '\x9b' : '>',
- '\x9c' : '&oelig;',
- '\x9d' : '?',
- '\x9e' : 'z',
- '\x9f' : '&Yuml;',}
-
- PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
- lambda(x):x.group(1) + ' />'),
+ MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
+ lambda x: x.group(1) + ' />'),
(re.compile('<!\s+([^<>]*)>'),
- lambda(x):'<!' + x.group(1) + '>'),
- (re.compile("([\x80-\x9f])"),
- lambda(x): BeautifulStoneSoup.MS_CHARS.get(x.group(1)))
+ lambda x: '<!' + x.group(1) + '>')
]
- ROOT_TAG_NAME = '[document]'
+ ROOT_TAG_NAME = u'[document]'
- def __init__(self, text=None, avoidParserProblems=True,
- initialTextIsEverything=True):
- """Initialize this as the 'root tag' and feed in any text to
- the parser.
+ HTML_ENTITIES = "html"
+ XML_ENTITIES = "xml"
+ ALL_ENTITIES = [HTML_ENTITIES, XML_ENTITIES]
- NOTE about avoidParserProblems: sgmllib will process most bad
- HTML, and BeautifulSoup has tricks for dealing with some HTML
- that kills sgmllib, but Beautiful Soup can nonetheless choke
- or lose data if your data uses self-closing tags or
- declarations incorrectly. By default, Beautiful Soup sanitizes
- its input to avoid the vast majority of these problems. The
- problems are relatively rare, even in bad HTML, so feel free
- to pass in False to avoidParserProblems if they don't apply to
- you, and you'll get better performance. The only reason I have
- this turned on by default is so I don't get so many tech
- support questions.
+ def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
+ markupMassage=True, smartQuotesTo=XML_ENTITIES,
+ convertEntities=None, selfClosingTags=None):
+ """The Soup object is initialized as the 'root tag', and the
+ provided markup (which can be a string or a file-like object)
+ is fed into the underlying parser.
- The two most common instances of invalid HTML that will choke
- sgmllib are fixed by the default parser massage techniques:
+ sgmllib will process most bad HTML, and the BeautifulSoup
+ class has some tricks for dealing with some HTML that kills
+ sgmllib, but Beautiful Soup can nonetheless choke or lose data
+ if your data uses self-closing tags or declarations
+ incorrectly.
+ By default, Beautiful Soup uses regexes to sanitize input,
+ avoiding the vast majority of these problems. If the problems
+ don't apply to you, pass in False for markupMassage, and
+ you'll get better performance.
+
+ The default parser massage techniques fix the two most common
+ instances of invalid HTML that choke sgmllib:
+
<br/> (No space between name of closing tag and tag close)
<! --Comment--> (Extraneous whitespace in declaration)
You can pass in a custom list of (RE object, replace method)
tuples to get Beautiful Soup to scrub your input the way you
want."""
- Tag.__init__(self, self.ROOT_TAG_NAME)
- if avoidParserProblems \
- and not isList(avoidParserProblems):
- avoidParserProblems = self.PARSER_MASSAGE
- self.avoidParserProblems = avoidParserProblems
+
+ self.parseOnlyThese = parseOnlyThese
+ self.fromEncoding = fromEncoding
+ self.smartQuotesTo = smartQuotesTo
+
+ if convertEntities:
+ # It doesn't make sense to convert encoded characters to
+ # entities even while you're converting entities to Unicode.
+ # Just convert it all to Unicode.
+ self.smartQuotesTo = None
+
+ if isList(convertEntities):
+ self.convertHTMLEntities = self.HTML_ENTITIES in convertEntities
+ self.convertXMLEntities = self.XML_ENTITIES in convertEntities
+ else:
+ self.convertHTMLEntities = self.HTML_ENTITIES == convertEntities
+ self.convertXMLEntities = self.XML_ENTITIES == convertEntities
+
+ self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
SGMLParser.__init__(self)
- self.quoteStack = []
- self.hidden = 1
+
+ if hasattr(markup, 'read'): # It's a file-type object.
+ markup = markup.read()
+ self.markup = markup
+ self.markupMassage = markupMassage
+ try:
+ self._feed()
+ except StopParsing:
+ pass
+ self.markup = None # The markup can now be GCed
+
+ def _feed(self, inDocumentEncoding=None):
+ # Convert the document to Unicode.
+ markup = self.markup
+ if isinstance(markup, unicode):
+ if not hasattr(self, 'originalEncoding'):
+ self.originalEncoding = None
+ else:
+ dammit = UnicodeDammit\
+ (markup, [self.fromEncoding, inDocumentEncoding],
+ smartQuotesTo=self.smartQuotesTo)
+ markup = dammit.unicode
+ self.originalEncoding = dammit.originalEncoding
+ if markup:
+ if self.markupMassage:
+ if not isList(self.markupMassage):
+ self.markupMassage = self.MARKUP_MASSAGE
+ for fix, m in self.markupMassage:
+ markup = fix.sub(m, markup)
self.reset()
- if hasattr(text, 'read'):
- #It's a file-type object.
- text = text.read()
- if text:
- self.feed(text)
- if initialTextIsEverything:
- self.done()
+ SGMLParser.feed(self, markup or "")
+ SGMLParser.close(self)
+ # Close out any unfinished strings and close all the open tags.
+ self.endData()
+ while self.currentTag.name != self.ROOT_TAG_NAME:
+ self.popTag()
+
def __getattr__(self, methodName):
"""This method routes method call requests to either the SGMLParser
superclass or the Tag superclass, depending on the method name."""
+ #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
+
if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
or methodName.find('do_') == 0:
return SGMLParser.__getattr__(self, methodName)
@@ -686,25 +1015,21 @@
else:
raise AttributeError
- def feed(self, text):
- if self.avoidParserProblems:
- for fix, m in self.avoidParserProblems:
- text = fix.sub(m, text)
- SGMLParser.feed(self, text)
-
- def done(self):
- """Called when you're done parsing, so that the unclosed tags can be
- correctly processed."""
- self.endData() #NEW
- while self.currentTag.name != self.ROOT_TAG_NAME:
- self.popTag()
+ def isSelfClosingTag(self, name):
+ """Returns true iff the given string is the name of a
+ self-closing tag according to this parser."""
+ return self.SELF_CLOSING_TAGS.has_key(name) \
+ or self.instanceSelfClosingTags.has_key(name)
def reset(self):
+ Tag.__init__(self, self, self.ROOT_TAG_NAME)
+ self.hidden = 1
SGMLParser.reset(self)
self.currentData = []
self.currentTag = None
self.tagStack = []
- self.pushTag(self)
+ self.quoteStack = []
+ self.pushTag(self)
def popTag(self):
tag = self.tagStack.pop()
@@ -712,7 +1037,7 @@
# 'string' property, so that soup.tag.string is shorthand for
# soup.tag.contents[0]
if len(self.currentTag.contents) == 1 and \
- isinstance(self.currentTag.contents[0], NavigableText):
+ isinstance(self.currentTag.contents[0], NavigableString):
self.currentTag.string = self.currentTag.contents[0]
#print "Pop", tag.name
@@ -727,30 +1052,35 @@
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
- def endData(self):
- currentData = ''.join(self.currentData)
- if currentData:
+ def endData(self, containerClass=NavigableString):
+ if self.currentData:
+ currentData = ''.join(self.currentData)
+ if currentData.endswith('<') and self.convertHTMLEntities:
+ currentData = currentData[:-1] + '&lt;'
if not currentData.strip():
if '\n' in currentData:
currentData = '\n'
else:
currentData = ' '
- c = NavigableString
- if type(currentData) == types.UnicodeType:
- c = NavigableUnicodeString
- o = c(currentData)
+ self.currentData = []
+ if self.parseOnlyThese and len(self.tagStack) <= 1 and \
+ (not self.parseOnlyThese.text or \
+ not self.parseOnlyThese.search(currentData)):
+ return
+ o = containerClass(currentData)
o.setup(self.currentTag, self.previous)
if self.previous:
self.previous.next = o
self.previous = o
self.currentTag.contents.append(o)
- self.currentData = []
+
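The whitespace handling in endData above reduces to a small rule, sketched here in Python 3 (illustrative only):

```python
def collapse_whitespace(s):
    """Sketch of endData's whitespace rule: a purely-whitespace string
    collapses to one newline if it contained a newline, otherwise to a
    single space; anything else passes through unchanged."""
    if not s.strip():
        return '\n' if '\n' in s else ' '
    return s
```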
def _popToTag(self, name, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
stack up to but *not* including the most recent instance of
the given tag."""
+ #print "Popping to %s" % name
if name == self.ROOT_TAG_NAME:
return
@@ -807,7 +1137,6 @@
#peculiar to this tag, or we encounter another tag
#that causes nesting to reset, pop up to but not
#including that tag.
-
popTo = p.name
inclusive = False
break
@@ -816,33 +1145,41 @@
self._popToTag(popTo, inclusive)
def unknown_starttag(self, name, attrs, selfClosing=0):
- #print "Start tag %s" % name
+ #print "Start tag %s: %s" % (name, attrs)
if self.quoteStack:
#This is not a real tag.
#print "<%s> is not real!" % name
attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
- self.handle_data('<%s%s>' % (name, attrs))
- return
+ self.currentData.append('<%s%s>' % (name, attrs))
+ return
self.endData()
- if not name in self.SELF_CLOSING_TAGS and not selfClosing:
+
+ if not self.isSelfClosingTag(name) and not selfClosing:
self._smartPop(name)
- tag = Tag(name, attrs, self.currentTag, self.previous)
+
+ if self.parseOnlyThese and len(self.tagStack) <= 1 \
+ and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
+ return
+
+ tag = Tag(self, name, attrs, self.currentTag, self.previous)
if self.previous:
self.previous.next = tag
self.previous = tag
self.pushTag(tag)
- if selfClosing or name in self.SELF_CLOSING_TAGS:
+ if selfClosing or self.isSelfClosingTag(name):
self.popTag()
if name in self.QUOTE_TAGS:
#print "Beginning quote (%s)" % name
self.quoteStack.append(name)
self.literal = 1
+ return tag
def unknown_endtag(self, name):
+ #print "End tag %s" % name
if self.quoteStack and self.quoteStack[-1] != name:
#This is not a real end tag.
#print "</%s> is not real!" % name
- self.handle_data('</%s>' % name)
+ self.currentData.append('</%s>' % name)
return
self.endData()
self._popToTag(name)
@@ -851,38 +1188,81 @@
self.literal = (len(self.quoteStack) > 0)
def handle_data(self, data):
+ if self.convertHTMLEntities:
+ if data[0] == '&':
+ data = self.BARE_AMPERSAND.sub("&amp;",data)
+ else:
+ data = data.replace('&','&amp;') \
+ .replace('<','&lt;') \
+ .replace('>','&gt;')
self.currentData.append(data)
+ def _toStringSubclass(self, text, subclass):
+ """Adds a certain piece of text to the tree as a NavigableString
+ subclass."""
+ self.endData()
+ self.handle_data(text)
+ self.endData(subclass)
+
def handle_pi(self, text):
- "Propagate processing instructions right through."
- self.handle_data("<?%s>" % text)
+ """Handle a processing instruction as a ProcessingInstruction
+ object, possibly one with a %SOUP-ENCODING% slot into which an
+ encoding will be plugged later."""
+ if text[:3] == "xml":
+ text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
+ self._toStringSubclass(text, ProcessingInstruction)
def handle_comment(self, text):
- "Propagate comments right through."
- self.handle_data("<!--%s-->" % text)
+ "Handle comments as Comment objects."
+ self._toStringSubclass(text, Comment)
def handle_charref(self, ref):
- "Propagate char refs right through."
- self.handle_data('&#%s;' % ref)
+ "Handle character references as data."
+ if ref[0] == 'x':
+ data = unichr(int(ref[1:],16))
+ else:
+ data = unichr(int(ref))
+
+ if u'\x80' <= data <= u'\x9F':
+ data = UnicodeDammit.subMSChar(chr(ord(data)), self.smartQuotesTo)
+ elif not self.convertHTMLEntities and not self.convertXMLEntities:
+ data = '&#%s;' % ref
+ self.handle_data(data)
+
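The conversion step inside handle_charref above (hex vs. decimal references) can be sketched standalone in Python 3 (illustrative, not the committed code):

```python
def charref_to_char(ref):
    """Python 3 sketch of handle_charref's decoding step: a leading 'x'
    marks a hexadecimal character reference, otherwise it is decimal."""
    if ref.startswith('x'):
        return chr(int(ref[1:], 16))
    return chr(int(ref))
```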
def handle_entityref(self, ref):
- "Propagate entity refs right through."
- self.handle_data('&%s;' % ref)
+ """Handle entity references as data, possibly converting known
+ HTML entity references to the corresponding Unicode
+ characters."""
+ replaceWithXMLEntity = self.convertXMLEntities and \
+ self.XML_ENTITIES_TO_CHARS.has_key(ref)
+ if self.convertHTMLEntities or replaceWithXMLEntity:
+ try:
+ data = unichr(name2codepoint[ref])
+ except KeyError:
+ if replaceWithXMLEntity:
+ data = self.XML_ENTITIES_TO_CHARS.get(ref)
+ else:
+ data="&amp;%s" % ref
+ else:
+ data = '&%s;' % ref
+ self.handle_data(data)
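The named-entity lookup in handle_entityref above maps to Python's standard entity table; an illustrative Python 3 sketch (the committed code also consults the XML_ENTITIES_TO_CHARS table, omitted here):

```python
from html.entities import name2codepoint

def entityref_to_text(ref, convert=True):
    """Python 3 sketch of handle_entityref: with conversion on, known
    names become Unicode characters and unknown names pass through with
    the ampersand escaped; with conversion off, the reference is kept
    verbatim."""
    if convert:
        try:
            return chr(name2codepoint[ref])
        except KeyError:
            return '&amp;%s' % ref
    return '&%s;' % ref
```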
def handle_decl(self, data):
- "Propagate DOCTYPEs and the like right through."
- self.handle_data('<!%s>' % data)
+ "Handle DOCTYPEs and the like as Declaration objects."
+ self._toStringSubclass(data, Declaration)
def parse_declaration(self, i):
"""Treat a bogus SGML declaration as raw data. Treat a CDATA
- declaration as regular data."""
+ declaration as a CData object."""
j = None
if self.rawdata[i:i+9] == '<![CDATA[':
k = self.rawdata.find(']]>', i)
if k == -1:
k = len(self.rawdata)
- self.handle_data(self.rawdata[i+9:k])
+ data = self.rawdata[i+9:k]
j = k+3
+ self._toStringSubclass(data, CData)
else:
try:
j = SGMLParser.parse_declaration(self, i)
@@ -937,12 +1317,18 @@
Differing assumptions about tag nesting rules are a major source
of problems with the BeautifulSoup class. If BeautifulSoup is not
treating as nestable a tag your page author treats as nestable,
- try ICantBelieveItsBeautifulSoup before writing your own
- subclass."""
+ try ICantBelieveItsBeautifulSoup, MinimalSoup, or
+ BeautifulStoneSoup before writing your own subclass."""
- SELF_CLOSING_TAGS = buildTagMap(None, ['br' , 'hr', 'input', 'img', 'meta',
- 'spacer', 'link', 'frame', 'base'])
+ def __init__(self, *args, **kwargs):
+ if not kwargs.has_key('smartQuotesTo'):
+ kwargs['smartQuotesTo'] = self.HTML_ENTITIES
+ BeautifulStoneSoup.__init__(self, *args, **kwargs)
+ SELF_CLOSING_TAGS = buildTagMap(None,
+ ['br' , 'hr', 'input', 'img', 'meta',
+ 'spacer', 'link', 'frame', 'base'])
+
QUOTE_TAGS = {'script': None}
#According to the HTML standard, each of these inline tags can
@@ -969,6 +1355,9 @@
'tr' : ['table', 'tbody', 'tfoot', 'thead'],
'td' : ['tr'],
'th' : ['tr'],
+ 'thead' : ['table'],
+ 'tbody' : ['table'],
+ 'tfoot' : ['table'],
}
NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
@@ -982,7 +1371,57 @@
NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
-
+
+ # Used to detect the charset in a META tag; see start_meta
+ CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
+
+ def start_meta(self, attrs):
+ """Beautiful Soup can detect a charset included in a META tag,
+ try to convert the document to that charset, and re-parse the
+ document from the beginning."""
+ httpEquiv = None
+ contentType = None
+ contentTypeIndex = None
+ tagNeedsEncodingSubstitution = False
+
+ for i in range(0, len(attrs)):
+ key, value = attrs[i]
+ key = key.lower()
+ if key == 'http-equiv':
+ httpEquiv = value
+ elif key == 'content':
+ contentType = value
+ contentTypeIndex = i
+
+ if httpEquiv and contentType: # It's an interesting meta tag.
+ match = self.CHARSET_RE.search(contentType)
+ if match:
+ if getattr(self, 'declaredHTMLEncoding') or \
+ (self.originalEncoding == self.fromEncoding):
+ # This is our second pass through the document, or
+ # else an encoding was specified explicitly and it
+ # worked. Rewrite the meta tag.
+ newAttr = self.CHARSET_RE.sub\
+ (lambda(match):match.group(1) +
+ "%SOUP-ENCODING%", value)
+ attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
+ newAttr)
+ tagNeedsEncodingSubstitution = True
+ else:
+ # This is our first pass through the document.
+ # Go through it again with the new information.
+ newCharset = match.group(3)
+ if newCharset and newCharset != self.originalEncoding:
+ self.declaredHTMLEncoding = newCharset
+ self._feed(self.declaredHTMLEncoding)
+ raise StopParsing
+ tag = self.unknown_starttag("meta", attrs)
+ if tag and tagNeedsEncodingSubstitution:
+ tag.containsSubstitutions = True
+
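The CHARSET_RE pattern and the rewrite performed in start_meta above behave as follows (a standalone Python 3 demonstration of the same regex):

```python
import re

# The same pattern as CHARSET_RE above; group 3 captures the charset value.
CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)")

def rewrite_charset(content_type):
    """Sketch of the meta-tag rewrite: the detected charset is replaced
    by the %SOUP-ENCODING% placeholder, to be filled in at output time."""
    return CHARSET_RE.sub(lambda m: m.group(1) + "%SOUP-ENCODING%",
                          content_type)
```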
+class StopParsing(Exception):
+ pass
+
class ICantBelieveItsBeautifulSoup(BeautifulSoup):
"""The BeautifulSoup class is oriented towards skipping over
@@ -1001,16 +1440,13 @@
can throw off the rest of your document structure. The same is
true of a number of other tags, listed below.
- It's much more common for someone to forget to close (eg.) a 'b'
- tag than to actually use nested 'b' tags, and the BeautifulSoup
- class handles the common case. This class handles the
- not-co-common case: where you can't believe someone wrote what
- they did, but it's valid HTML and BeautifulSoup screwed up by
- assuming it wouldn't be.
+ It's much more common for someone to forget to close a 'b' tag
+ than to actually use nested 'b' tags, and the BeautifulSoup class
+ handles the common case. This class handles the not-so-common
+ case: where you can't believe someone wrote what they did, but
+ it's valid HTML and BeautifulSoup screwed up by assuming it
+ wouldn't be."""
- If this doesn't do what you need, try subclassing this class or
- BeautifulSoup, and providing your own list of NESTABLE_TAGS."""
-
I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
@@ -1022,6 +1458,19 @@
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
+class MinimalSoup(BeautifulSoup):
+ """The MinimalSoup class is for parsing HTML that contains
+ pathologically bad markup. It makes no assumptions about tag
+ nesting, but it does know which tags are self-closing, that
+ <script> tags contain Javascript and should not be parsed, that
+ META tags may contain encoding information, and so on.
+
+ This also makes it better for subclassing than BeautifulStoneSoup
+ or BeautifulSoup."""
+
+ RESET_NESTING_TAGS = buildTagMap('noscript')
+ NESTABLE_TAGS = {}
+
class BeautifulSOAP(BeautifulStoneSoup):
"""This class will push a tag with only a single string child into
the tag's parent as an attribute. The attribute's name is the tag
@@ -1048,7 +1497,7 @@
parent = self.tagStack[-2]
parent._getAttrMap()
if (isinstance(tag, Tag) and len(tag.contents) == 1 and
- isinstance(tag.contents[0], NavigableText) and
+ isinstance(tag.contents[0], NavigableString) and
not parent.attrMap.has_key(tag.name)):
parent[tag.name] = tag.contents[0]
BeautifulStoneSoup.popTag(self)
@@ -1067,14 +1516,297 @@
pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
pass
+class RobustInsanelyWackAssHTMLParser(MinimalSoup):
+ pass
class SimplifyingSOAPParser(BeautifulSOAP):
pass
-###
+######################################################
+#
+# Bonus library: Unicode, Dammit
+#
+# This class forces XML data into a standard format (usually to UTF-8
+# or Unicode). It is heavily based on code from Mark Pilgrim's
+# Universal Feed Parser. It does not rewrite the XML or HTML to
+# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
+# (XML) and BeautifulSoup.start_meta (HTML).
+# Autodetects character encodings.
+# Download from http://chardet.feedparser.org/
+try:
+ import chardet
+# import chardet.constants
+# chardet.constants._debug = 1
+except:
+ chardet = None
+# cjkcodecs and iconv_codec make Python know about more character encodings.
+# Both are available from http://cjkpython.i18n.org/
+# They're built in if you use Python 2.4.
+try:
+ import cjkcodecs.aliases
+except:
+ pass
+try:
+ import iconv_codec
+except:
+ pass
+
+class UnicodeDammit:
+ """A class for detecting the encoding of a *ML document and
+ converting it to a Unicode string. If the source encoding is
+ windows-1252, can replace MS smart quotes with their HTML or XML
+ equivalents."""
+
+ # This dictionary maps commonly seen values for "charset" in HTML
+ # meta tags to the corresponding Python codec names. It only covers
+ # values that aren't in Python's aliases and can't be determined
+ # by the heuristics in find_codec.
+ CHARSET_ALIASES = { "macintosh" : "mac-roman",
+ "x-sjis" : "shift-jis" }
+
+ def __init__(self, markup, overrideEncodings=[],
+ smartQuotesTo='xml'):
+ self.markup, documentEncoding, sniffedEncoding = \
+ self._detectEncoding(markup)
+ self.smartQuotesTo = smartQuotesTo
+ self.triedEncodings = []
+ if isinstance(markup, unicode):
+ self.unicode = markup
+ self.originalEncoding = None
+ return
+
+ u = None
+ for proposedEncoding in overrideEncodings:
+ u = self._convertFrom(proposedEncoding)
+ if u: break
+ if not u:
+ for proposedEncoding in (documentEncoding, sniffedEncoding):
+ u = self._convertFrom(proposedEncoding)
+ if u: break
+
+ # If no luck and we have auto-detection library, try that:
+ if not u and chardet and not isinstance(self.markup, unicode):
+ u = self._convertFrom(chardet.detect(self.markup)['encoding'])
+
+ # As a last resort, try utf-8 and windows-1252:
+ if not u:
+ for proposed_encoding in ("utf-8", "windows-1252"):
+ u = self._convertFrom(proposed_encoding)
+ if u: break
+ self.unicode = u
+ if not u: self.originalEncoding = None
+
+ def subMSChar(orig, smartQuotesTo):
+ """Changes a MS smart quote character to an XML or HTML
+ entity."""
+ sub = UnicodeDammit.MS_CHARS.get(orig)
+ if type(sub) == types.TupleType:
+ if smartQuotesTo == 'xml':
+ sub = '&#x%s;' % sub[1]
+ elif smartQuotesTo == 'html':
+ sub = '&%s;' % sub[0]
+ else:
+ sub = unichr(int(sub[1],16))
+ return sub
+ subMSChar = staticmethod(subMSChar)
+
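subMSChar above dispatches on the smartQuotesTo setting using the (entity name, hex codepoint) tuples from the MS_CHARS table defined later in this class. An illustrative Python 3 sketch using a two-entry excerpt of that table:

```python
# Excerpt of the MS_CHARS table, keyed by the raw windows-1252 character.
MS_CHARS_EXCERPT = {'\x91': ('lsquo', '2018'),
                    '\x92': ('rsquo', '2019')}

def sub_ms_char(orig, smart_quotes_to):
    """Python 3 sketch of subMSChar using the excerpt above: emit an XML
    numeric reference, an HTML named entity, or the bare character."""
    sub = MS_CHARS_EXCERPT.get(orig)
    if isinstance(sub, tuple):
        if smart_quotes_to == 'xml':
            return '&#x%s;' % sub[1]
        if smart_quotes_to == 'html':
            return '&%s;' % sub[0]
        return chr(int(sub[1], 16))
    return sub
```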
+ def _convertFrom(self, proposed):
+ proposed = self.find_codec(proposed)
+ if not proposed or proposed in self.triedEncodings:
+ return None
+ self.triedEncodings.append(proposed)
+ markup = self.markup
+
+ # Convert smart quotes to HTML if coming from an encoding
+ # that might have them.
+ if self.smartQuotesTo and proposed in("windows-1252",
+ "ISO-8859-1",
+ "ISO-8859-2"):
+ markup = re.compile("([\x80-\x9f])").sub \
+ (lambda(x): self.subMSChar(x.group(1),self.smartQuotesTo),
+ markup)
+
+ try:
+ # print "Trying to convert document to %s" % proposed
+ u = self._toUnicode(markup, proposed)
+ self.markup = u
+ self.originalEncoding = proposed
+ except Exception, e:
+ # print "That didn't work!"
+ # print e
+ return None
+ #print "Correct encoding: %s" % proposed
+ return self.markup
+
+ def _toUnicode(self, data, encoding):
+ '''Given a string and its encoding, decodes the string into Unicode.
+ %encoding is a string recognized by encodings.aliases'''
+
+ # strip Byte Order Mark (if present)
+ if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
+ and (data[2:4] != '\x00\x00'):
+ encoding = 'utf-16be'
+ data = data[2:]
+ elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
+ and (data[2:4] != '\x00\x00'):
+ encoding = 'utf-16le'
+ data = data[2:]
+ elif data[:3] == '\xef\xbb\xbf':
+ encoding = 'utf-8'
+ data = data[3:]
+ elif data[:4] == '\x00\x00\xfe\xff':
+ encoding = 'utf-32be'
+ data = data[4:]
+ elif data[:4] == '\xff\xfe\x00\x00':
+ encoding = 'utf-32le'
+ data = data[4:]
+ newdata = unicode(data, encoding)
+ return newdata
+
+ def _detectEncoding(self, xml_data):
+ """Given a document, tries to detect its XML encoding."""
+ xml_encoding = sniffed_xml_encoding = None
+ try:
+ if xml_data[:4] == '\x4c\x6f\xa7\x94':
+ # EBCDIC
+ xml_data = self._ebcdic_to_ascii(xml_data)
+ elif xml_data[:4] == '\x00\x3c\x00\x3f':
+ # UTF-16BE
+ sniffed_xml_encoding = 'utf-16be'
+ xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
+ elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
+ and (xml_data[2:4] != '\x00\x00'):
+ # UTF-16BE with BOM
+ sniffed_xml_encoding = 'utf-16be'
+ xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
+ elif xml_data[:4] == '\x3c\x00\x3f\x00':
+ # UTF-16LE
+ sniffed_xml_encoding = 'utf-16le'
+ xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
+ elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
+ (xml_data[2:4] != '\x00\x00'):
+ # UTF-16LE with BOM
+ sniffed_xml_encoding = 'utf-16le'
+ xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
+ elif xml_data[:4] == '\x00\x00\x00\x3c':
+ # UTF-32BE
+ sniffed_xml_encoding = 'utf-32be'
+ xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
+ elif xml_data[:4] == '\x3c\x00\x00\x00':
+ # UTF-32LE
+ sniffed_xml_encoding = 'utf-32le'
+ xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
+ elif xml_data[:4] == '\x00\x00\xfe\xff':
+ # UTF-32BE with BOM
+ sniffed_xml_encoding = 'utf-32be'
+ xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
+ elif xml_data[:4] == '\xff\xfe\x00\x00':
+ # UTF-32LE with BOM
+ sniffed_xml_encoding = 'utf-32le'
+ xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
+ elif xml_data[:3] == '\xef\xbb\xbf':
+ # UTF-8 with BOM
+ sniffed_xml_encoding = 'utf-8'
+ xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
+ else:
+ sniffed_xml_encoding = 'ascii'
+ pass
+ xml_encoding_match = re.compile \
+ ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
+ .match(xml_data)
+ except:
+ xml_encoding_match = None
+ if xml_encoding_match:
+ xml_encoding = xml_encoding_match.groups()[0].lower()
+ if sniffed_xml_encoding and \
+ (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
+ 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
+ 'utf-16', 'utf-32', 'utf_16', 'utf_32',
+ 'utf16', 'u16')):
+ xml_encoding = sniffed_xml_encoding
+ return xml_data, xml_encoding, sniffed_xml_encoding
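The BOM checks in _detectEncoding above reduce to an ordered series of prefix tests; a Python 3 sketch (illustrative, covering only the BOM cases, not the EBCDIC or BOM-less sniffing branches):

```python
def sniff_bom(data):
    """Python 3 sketch of the BOM checks in _detectEncoding: bytes in,
    (encoding, remainder) out. The 4-byte UTF-32 BOMs are tested before
    the 2-byte UTF-16 BOMs they start with, so order matters."""
    if data[:4] == b'\x00\x00\xfe\xff':
        return 'utf-32be', data[4:]
    if data[:4] == b'\xff\xfe\x00\x00':
        return 'utf-32le', data[4:]
    if data[:2] == b'\xfe\xff':
        return 'utf-16be', data[2:]
    if data[:2] == b'\xff\xfe':
        return 'utf-16le', data[2:]
    if data[:3] == b'\xef\xbb\xbf':
        return 'utf-8', data[3:]
    return None, data
```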
+
+
+ def find_codec(self, charset):
+ return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
+ or (charset and self._codec(charset.replace("-", ""))) \
+ or (charset and self._codec(charset.replace("-", "_"))) \
+ or charset
+
+ def _codec(self, charset):
+ if not charset: return charset
+ codec = None
+ try:
+ codecs.lookup(charset)
+ codec = charset
+ except LookupError:
+ pass
+ return codec
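_codec above simply asks the codecs registry whether a name is usable; the same check in Python 3 (illustrative):

```python
import codecs

def codec_or_none(charset):
    """Sketch of UnicodeDammit._codec: return the charset name only if
    Python actually has a codec registered for it."""
    if not charset:
        return charset
    try:
        codecs.lookup(charset)
        return charset
    except LookupError:
        return None
```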
+
+ EBCDIC_TO_ASCII_MAP = None
+ def _ebcdic_to_ascii(self, s):
+ c = self.__class__
+ if not c.EBCDIC_TO_ASCII_MAP:
+ emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
+ 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
+ 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
+ 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
+ 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
+ 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
+ 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
+ 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
+ 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
+ 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
+ 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
+ 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
+ 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
+ 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
+ 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
+ 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
+ 250,251,252,253,254,255)
+ import string
+ c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
+ ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+ return s.translate(c.EBCDIC_TO_ASCII_MAP)
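The translation above builds a 256-byte table once and reuses it. A toy Python 3 version using bytes.maketrans-style tables; the map here is deliberately partial, covering only EBCDIC 'A'..'C' (0xC1..0xC3) for illustration, not the full table from the source:

```python
# Partial EBCDIC->ASCII map for illustration; unmapped bytes stay as-is.
overrides = {0xC1: ord('A'), 0xC2: ord('B'), 0xC3: ord('C')}
table = bytes(overrides.get(i, i) for i in range(256))

def ebcdic_to_ascii(s):
    """Translate EBCDIC bytes to ASCII using the partial table above."""
    return s.translate(table)
```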
+
+ MS_CHARS = { '\x80' : ('euro', '20AC'),
+ '\x81' : ' ',
+ '\x82' : ('sbquo', '201A'),
+ '\x83' : ('fnof', '192'),
+ '\x84' : ('bdquo', '201E'),
+ '\x85' : ('hellip', '2026'),
+ '\x86' : ('dagger', '2020'),
+ '\x87' : ('Dagger', '2021'),
+ '\x88' : ('circ', '2C6'),
+ '\x89' : ('permil', '2030'),
+ '\x8A' : ('Scaron', '160'),
+ '\x8B' : ('lsaquo', '2039'),
+ '\x8C' : ('OElig', '152'),
+ '\x8D' : '?',
+ '\x8E' : ('#x17D', '17D'),
+ '\x8F' : '?',
+ '\x90' : '?',
+ '\x91' : ('lsquo', '2018'),
+ '\x92' : ('rsquo', '2019'),
+ '\x93' : ('ldquo', '201C'),
+ '\x94' : ('rdquo', '201D'),
+ '\x95' : ('bull', '2022'),
+ '\x96' : ('ndash', '2013'),
+ '\x97' : ('mdash', '2014'),
+ '\x98' : ('tilde', '2DC'),
+ '\x99' : ('trade', '2122'),
+ '\x9a' : ('scaron', '161'),
+ '\x9b' : ('rsaquo', '203A'),
+ '\x9c' : ('oelig', '153'),
+ '\x9d' : '?',
+ '\x9e' : ('#x17E', '17E'),
+ '\x9f' : ('Yuml', '178'),}
+
+#######################################################################
+
+
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
- soup = BeautifulStoneSoup(sys.stdin.read())
+ soup = BeautifulSoup(sys.stdin.read())
print soup.prettify()
Modified: Zope3/branches/benji-testbrowser-with-real-browsers-take-2/src/zope/testbrowser/real/real.txt
===================================================================
--- Zope3/branches/benji-testbrowser-with-real-browsers-take-2/src/zope/testbrowser/real/real.txt 2006-08-14 13:14:44 UTC (rev 69467)
+++ Zope3/branches/benji-testbrowser-with-real-browsers-take-2/src/zope/testbrowser/real/real.txt 2006-08-14 13:33:13 UTC (rev 69468)
@@ -150,7 +150,7 @@
>>> link = browser.getLink('Link Text')
>>> link
<Link text='Link Text'
- url='http://localhost/@@/testbrowser/navigate.html?message=By+Link+Text'>
+ url=u'http://localhost/@@/testbrowser/navigate.html?message=By+Link+Text'>
Link objects comply with the ILink interface.
@@ -162,11 +162,11 @@
>>> link.text
'Link Text'
>>> link.tag # links can also be image maps.
- 'a'
+ u'a'
>>> link.url # it's normalized
- 'http://localhost/@@/testbrowser/navigate.html?message=By+Link+Text'
+ u'http://localhost/@@/testbrowser/navigate.html?message=By+Link+Text'
>>> link.attrs
- {'href': 'navigate.html?message=By+Link+Text'}
+ {u'href': u'navigate.html?message=By+Link+Text'}
Links can be "clicked" and the browser will navigate to the referenced URL.
@@ -233,7 +233,7 @@
>>> browser.open('http://localhost/@@/testbrowser/navigate.html')
>>> link = browser.getLink(id='zope3')
>>> link.tag
- 'area'
+ u'area'
>>> link.click()
>>> browser.url
'http://localhost/@@/testbrowser/navigate.html?message=Zope+3+Name'