[Zope3-checkins] SVN: Zope3/branches/benji-integrate-new-mechanize/src/ checkpoint some changes

Mon Jun 19 11:22:06 EDT 2006

Log message for revision 68754:
  checkpoint some changes

Changed:
  D   Zope3/branches/benji-integrate-new-mechanize/src/ClientCookie/
  U   Zope3/branches/benji-integrate-new-mechanize/src/ClientForm.py
  D   Zope3/branches/benji-integrate-new-mechanize/src/mechanize/
  U   Zope3/branches/benji-integrate-new-mechanize/src/pullparser.py
  U   Zope3/branches/benji-integrate-new-mechanize/src/zope/app/apidoc/browser/README.txt
  U   Zope3/branches/benji-integrate-new-mechanize/src/zope/app/apidoc/codemodule/browser/introspector.txt
  U   Zope3/branches/benji-integrate-new-mechanize/src/zope/testbrowser/browser.py
  U   Zope3/branches/benji-integrate-new-mechanize/src/zope/testbrowser/testing.py
  U   Zope3/branches/benji-integrate-new-mechanize/src/zope/testbrowser/tests.py

-=-
Modified: Zope3/branches/benji-integrate-new-mechanize/src/ClientForm.py
===================================================================

--- Zope3/branches/benji-integrate-new-mechanize/src/ClientForm.py	2006-06-19 15:03:17 UTC (rev 68753)
+++ Zope3/branches/benji-integrate-new-mechanize/src/ClientForm.py	2006-06-19 15:22:02 UTC (rev 68754)
@@ -15,7 +15,7 @@
 HTML 4.01 Specification, W3C Recommendation 24 December 1999
 
 
-Copyright 2002-2005 John J. Lee <jjl at pobox.com>
+Copyright 2002-2006 John J. Lee <jjl at pobox.com>
 Copyright 2005 Gary Poster
 Copyright 2005 Zope Corporation
 Copyright 1998-2000 Gisle Aas.
@@ -27,44 +27,40 @@
 """
 
 # XXX
+# Remove unescape_attr method
+# Remove parser testing hack
+# safeUrl()-ize action
+# Really should to merge CC, CF, pp and mechanize as soon as mechanize
+#  goes to beta...
+# Add url attribute to ParseError
+# Switch to unicode throughout (would be 0.3.x)
+#  See Wichert Akkerman's 2004-01-22 message to c.l.py.
+# Add charset parameter to Content-type headers?  How to find value??
 # Add some more functional tests
 #  Especially single and multiple file upload on the internet.
 #  Does file upload work when name is missing?  Sourceforge tracker form
 #   doesn't like it.  Check standards, and test with Apache.  Test
 #   binary upload with Apache.
-# There have been reports that some servers are very picky about MIME
-#  boundaries, so file uploads may fail with those servers.  Should
-#  copy what IE does religiously.
-# Unicode: see Wichert Akkerman's 2004-01-22 message to c.l.py.
 # Controls can have name=None (e.g. forms constructed partly with
 #  JavaScript), but find_control can't be told to find a control
 #  with that name, because None there means 'unspecified'.  Can still
 #  get at by nr, but would be nice to be able to specify something
 #  equivalent to name=None, too.
-# Deal with character sets properly.  Not sure what the issues are here.
-#  Do URL encodings need any attention?
-#  I don't *think* any encoding of control names, filenames or data is
-#   necessary -- HTML spec. doesn't require it, and Mozilla Firebird 0.6
-#   doesn't seem to do it.
-#  Add charset parameter to Content-type headers?  How to find value??
 # mailto submission & enctype text/plain
 # I'm not going to fix this unless somebody tells me what real servers
 #  that want this encoding actually expect: If enctype is
 #  application/x-www-form-urlencoded and there's a FILE control present.
 #  Strictly, it should be 'name=data' (see HTML 4.01 spec., section
 #  17.13.2), but I send "name=" ATM.  What about multiple file upload??
-# Get rid of MimeWriter.
-# Should really use sgmllib, not htmllib.
 
 # Would be nice, but I'm not going to do it myself:
 # -------------------------------------------------
-# Maybe a 0.3.x?
+# Maybe a 0.4.x?
 #   Replace by_label etc. with moniker / selector concept. Allows, eg.,
 #    a choice between selection by value / id / label / element
 #    contents.  Or choice between matching labels exactly or by
 #    substring.  Etc.
 #   Remove deprecated methods.
-#   action should probably be an absolute URI, like DOMForm.
 #   ...what else?
 # Work on DOMForm.
 # XForms?  Don't know if there's a need here.
@@ -81,8 +77,38 @@
         if expr: return True
         else: return False
 
+try:
+    import logging
+except ImportError:
+    def debug(msg, *args, **kwds):
+        pass
+else:
+    _logger = logging.getLogger("ClientForm")
+    OPTIMIZATION_HACK = True
+
+    def debug(msg, *args, **kwds):
+        if OPTIMIZATION_HACK:
+            return
+
+        try:
+            raise Exception()
+        except:
+            caller_name = (
+                sys.exc_info()[2].tb_frame.f_back.f_back.f_code.co_name)
+        extended_msg = '%%s %s' % msg
+        extended_args = (caller_name,)+args
+        debug = _logger.debug(extended_msg, *extended_args, **kwds)
+
+    def _show_debug_messages():
+        global OPTIMIZATION_HACK
+        OPTIMIZATION_HACK = False
+        _logger.setLevel(logging.DEBUG)
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setLevel(logging.DEBUG)
+        _logger.addHandler(handler)
+
 import sys, urllib, urllib2, types, mimetools, copy, urlparse, \
-       htmlentitydefs, re
+       htmlentitydefs, re, random
 from urlparse import urljoin
 from cStringIO import StringIO
 
@@ -95,10 +121,12 @@
     def deprecation(message):
         warnings.warn(message, DeprecationWarning, stacklevel=2)
 
-VERSION = "0.2.1a"
+VERSION = "0.2.2"
 
 CHUNK = 1024  # size of chunks fed to parser, in bytes
 
+DEFAULT_ENCODING = "latin-1"
+
 _compress_re = re.compile(r"\s+")
 def compress_text(text): return _compress_re.sub(" ", text.strip())
 
@@ -171,15 +199,62 @@
                         l.append(k + '=' + urllib.quote_plus(str(elt)))
     return '&'.join(l)
 
-def unescape(data, entities):
-    if data is None or '&' not in data:
+def unescape(data, entities, encoding=DEFAULT_ENCODING):
+    if data is None or "&" not in data:
         return data
-    def replace_entities(match, entities=entities):
+
+    def replace_entities(match, entities=entities, encoding=encoding):
         ent = match.group()
-        repl = entities.get(ent, ent)
+        if ent[1] == "#":
+            return unescape_charref(ent[2:-1], encoding)
+
+        repl = entities.get(ent)
+        if repl is not None:
+            if type(repl) != type(""):
+                try:
+                    repl = repl.encode(encoding)
+                except UnicodeError:
+                    repl = ent
+        else:
+            repl = ent
+
         return repl
-    return re.sub(r'&\S+?;', replace_entities, data)
 
+    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
+
+def unescape_charref(data, encoding):
+    name, base = data, 10
+    if name.startswith("x"):
+        name, base= name[1:], 16
+    uc = unichr(int(name, base))
+    if encoding is None:
+        return uc
+    else:
+        try:
+            repl = uc.encode(encoding)
+        except UnicodeError:
+            repl = "&#%s;" % data
+        return repl
+
+def get_entitydefs():
+    import htmlentitydefs
+    from codecs import latin_1_decode
+    entitydefs = {}
+    try:
+        htmlentitydefs.name2codepoint
+    except AttributeError:
+        entitydefs = {}
+        for name, char in htmlentitydefs.entitydefs.items():
+            uc = latin_1_decode(char)[0]
+            if uc.startswith("&#") and uc.endswith(";"):
+                uc = unescape_charref(uc[2:-1], None)
+            entitydefs["&%s;" % name] = uc
+    else:
+        for name, codepoint in htmlentitydefs.name2codepoint.items():
+            entitydefs["&%s;" % name] = unichr(codepoint)
+    return entitydefs
+
+
 def issequence(x):
     try:
         x[0]
@@ -195,74 +270,15 @@
     else: return True
 
 
-# XXX don't really want to drag this along (MimeWriter, choose_boundary)
-
-# --------------------------------------------------------------------
-# grabbed from Python standard library mimetools module and tweaked to
-# avoid socket.gaierror and to avoid dots ('.') in MIME boundaries
-try:
-    import thread
-    _thread = thread; del thread
-except ImportError:
-    import dummy_thread
-    _thread = dummy_thread; del dummy_thread
-_counter_lock = _thread.allocate_lock()
-del _thread
-
-_counter = 0
-def _get_next_counter():
-    global _counter
-    _counter_lock.acquire()
-    _counter = _counter + 1
-    result = _counter
-    _counter_lock.release()
-    return result
-
-_prefix = None
-
 def choose_boundary():
-    """Return a string usable as a multipart boundary.
+    """Return a string usable as a multipart boundary."""
+    # follow IE and firefox
+    nonce = "".join([str(random.randint(0, sys.maxint-1)) for i in 0,1,2])
+    return "-"*27 + nonce
 
-    The string chosen is unique within a single program run, and
-    incorporates the user id (if available), process id (if available),
-    and current time.  So it's very unlikely the returned string appears
-    in message text, but there's no guarantee.
-
-    The boundary contains dots so you have to quote it in the header."""
-
-    global _prefix
-    import time
-    import os
-    import socket
-    if _prefix is None:
-        try:
-            socket.gaierror
-        except AttributeError:
-            exc = socket.error
-        else:
-            exc = socket.gaierror
-
-        try:
-            hostid = socket.gethostbyname(socket.gethostname())
-        except exc:
-            hostid = 'localhost'
-        try:
-            uid = repr(os.getuid())
-        except AttributeError:
-            uid = '1'
-        try:
-            pid = repr(os.getpid())
-        except AttributeError:
-            pid = '1'
-        _prefix = hostid + uid + pid
-    return "%s%d%d" % (_prefix, long(time.time()*100), _get_next_counter())
-
-# end of code from mimetools module
-# --------------------------------------------------------------------
-
 # This cut-n-pasted MimeWriter from standard library is here so can add
 # to HTTP headers rather than message body when appropriate.  It also uses
-# \r\n in place of \n.  This is nasty.
+# \r\n in place of \n.  This is a bit nasty.
 class MimeWriter:
 
     """Generic MIME writer.
@@ -420,10 +436,11 @@
 class _AbstractFormParser:
     """forms attribute contains HTMLForm instances on completion."""
     # thanks to Moshe Zadka for an example of sgmllib/htmllib usage
-    def __init__(self, entitydefs=None):
+    def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
         if entitydefs is None:
             entitydefs = get_entitydefs()
         self._entitydefs = entitydefs
+        self._encoding = encoding
 
         self.base = None
         self.forms = []
@@ -436,17 +453,21 @@
         self._textarea = None
 
     def do_base(self, attrs):
+        debug("%s", attrs)
         for key, value in attrs:
             if key == "href":
                 self.base = value
 
     def end_body(self):
+        debug("")
         if self._current_label is not None:
             self.end_label()
         if self._current_form is not None:
             self.end_form()
 
     def start_form(self, attrs):
+#        import pdb;pdb.set_trace()
+        debug("%s", attrs)
         if self._current_form is not None:
             raise ParseError("nested FORMs")
         name = None
@@ -468,6 +489,7 @@
         self._current_form = (name, action, method, enctype), d, controls
 
     def end_form(self):
+        debug("")
         if self._current_label is not None:
             self.end_label()
         if self._current_form is None:
@@ -476,6 +498,7 @@
         self._current_form = None
 
     def start_select(self, attrs):
+        debug("%s", attrs)
         if self._current_form is None:
             raise ParseError("start of SELECT before start of FORM")
         if self._select is not None:
@@ -492,6 +515,7 @@
         self._append_select_control({"__select": d})
 
     def end_select(self):
+        debug("")
         if self._current_form is None:
             raise ParseError("end of SELECT before start of FORM")
         if self._select is None:
@@ -503,6 +527,7 @@
         self._select = None
 
     def start_optgroup(self, attrs):
+        debug("%s", attrs)
         if self._select is None:
             raise ParseError("OPTGROUP outside of SELECT")
         d = {}
@@ -512,11 +537,13 @@
         self._optgroup = d
 
     def end_optgroup(self):
+        debug("")
         if self._optgroup is None:
             raise ParseError("end of OPTGROUP before start")
         self._optgroup = None
 
     def _start_option(self, attrs):
+        debug("%s", attrs)
         if self._select is None:
             raise ParseError("OPTION outside of SELECT")
         if self._option is not None:
@@ -533,6 +560,7 @@
             self._option["disabled"] = None
 
     def _end_option(self):
+        debug("")
         if self._option is None:
             raise ParseError("end of OPTION before start")
 
@@ -549,11 +577,13 @@
         self._option = None
 
     def _append_select_control(self, attrs):
+        debug("%s", attrs)
         controls = self._current_form[2]
         name = self._select.get("name")
         controls.append(("select", name, attrs))
 
     def start_textarea(self, attrs):
+        debug("%s", attrs)
         if self._current_form is None:
             raise ParseError("start of TEXTAREA before start of FORM")
         if self._textarea is not None:
@@ -568,6 +598,7 @@
         self._textarea = d
 
     def end_textarea(self):
+        debug("")
         if self._current_form is None:
             raise ParseError("end of TEXTAREA before start of FORM")
         if self._textarea is None:
@@ -578,6 +609,7 @@
         self._textarea = None
 
     def start_label(self, attrs):
+        debug("%s", attrs)
         if self._current_label:
             self.end_label()
         d = {}
@@ -591,6 +623,7 @@
         self._current_label = d
 
     def end_label(self):
+        debug("")
         label = self._current_label
         if label is None:
             # something is ugly in the HTML, but we're ignoring it
@@ -601,6 +634,7 @@
         del label["__taken"]
 
     def _add_label(self, d):
+        #debug("%s", d)
         if self._current_label is not None:
             if self._current_label["__taken"]:
                 self.end_label()  # be fuzzy
@@ -609,6 +643,7 @@
                 d["__label"] = self._current_label
 
     def handle_data(self, data):
+        debug("%s", data)
         if self._option is not None:
             # self._option is a dictionary of the OPTION element's HTML
             # attributes, but it has two special keys, one of which is the
@@ -632,6 +667,7 @@
             map[key] = map[key] + data
 
     def do_button(self, attrs):
+        debug("%s", attrs)
         if self._current_form is None:
             raise ParseError("start of BUTTON before start of FORM")
         d = {}
@@ -651,6 +687,7 @@
         controls.append((type, name, d))
 
     def do_input(self, attrs):
+        debug("%s", attrs)
         if self._current_form is None:
             raise ParseError("start of INPUT before start of FORM")
         d = {}
@@ -665,6 +702,7 @@
         controls.append((type, name, d))
 
     def do_isindex(self, attrs):
+        debug("%s", attrs)
         if self._current_form is None:
             raise ParseError("start of ISINDEX before start of FORM")
         d = {}
@@ -677,18 +715,20 @@
         controls.append(("isindex", None, d))
 
     def handle_entityref(self, name):
-        table = self._entitydefs
-        fullname = "&%s;" % name
-        if table.has_key(fullname):
-            self.handle_data(table[fullname])
-        else:
-            self.unknown_entityref(name)
-            return
+        #debug("%s", name)
+        self.handle_data(unescape(
+            '&%s;' % name, self._entitydefs, self._encoding))
 
+    def handle_charref(self, name):
+        #debug("%s", name)
+        self.handle_data(unescape_charref(name, self._encoding))
+
     def unescape_attr(self, name):
-        return unescape(name, self._entitydefs)
+        #debug("%s", name)
+        return unescape(name, self._entitydefs, self._encoding)
 
     def unescape_attrs(self, attrs):
+        #debug("%s", attrs)
         escaped_attrs = {}
         for key, val in attrs.items():
             try:
@@ -710,15 +750,15 @@
     import HTMLParser
 except ImportError:
     class XHTMLCompatibleFormParser:
-        def __init__(self, entitydefs=None):
+        def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
             raise ValueError("HTMLParser could not be imported")
 else:
     class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
         """Good for XHTML, bad for tolerance of incorrect HTML."""
         # thanks to Michael Howitz for this!
-        def __init__(self, entitydefs=None):
+        def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
             HTMLParser.HTMLParser.__init__(self)
-            _AbstractFormParser.__init__(self, entitydefs)
+            _AbstractFormParser.__init__(self, entitydefs, encoding)
 
         def start_option(self, attrs):
             _AbstractFormParser._start_option(self, attrs)
@@ -747,18 +787,6 @@
             else:
                 method()
 
-        # taken from sgmllib, with changes
-        def handle_charref(self, name):
-            try:
-                n = int(name)
-            except ValueError:
-                self.unknown_charref(name)
-                return
-            if not 0 <= n <= 255:
-                self.unknown_charref(name)
-                return
-            self.handle_data(chr(n))
-
         def unescape(self, name):
             # Use the entitydefs passed into constructor, not
             # HTMLParser.HTMLParser's entitydefs.
@@ -769,13 +797,10 @@
         def unescape_attrs_if_required(self, attrs):
             return attrs  # ditto
 
-import htmllib, formatter
-class FormParser(_AbstractFormParser, htmllib.HTMLParser):
-    """Good for tolerance of incorrect HTML, bad for XHTML."""
-    def __init__(self, entitydefs=None):
-        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
-        _AbstractFormParser.__init__(self, entitydefs)
-
+import sgmllib
+# monkeypatch to fix http://www.python.org/sf/803422 :-(
+sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
+class _AbstractSgmllibParser(_AbstractFormParser):
     def do_option(self, attrs):
         _AbstractFormParser._start_option(self, attrs)
 
@@ -784,19 +809,52 @@
     def unescape_attrs_if_required(self, attrs):
         return self.unescape_attrs(attrs)
 
+class FormParser(_AbstractSgmllibParser, sgmllib.SGMLParser):
+    """Good for tolerance of incorrect HTML, bad for XHTML."""
+    def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
+        sgmllib.SGMLParser.__init__(self)
+        _AbstractFormParser.__init__(self, entitydefs, encoding)
+
+try:
+    if sys.version_info[:2] < (2, 2):
+        raise ImportError  # BeautifulSoup uses generators
+    import BeautifulSoup
+except ImportError:
+    pass
+else:
+    class _AbstractBSFormParser(_AbstractSgmllibParser):
+        bs_base_class = None
+        def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
+            _AbstractFormParser.__init__(self, entitydefs, encoding)
+            self.bs_base_class.__init__(self)
+        def handle_data(self, data):
+            _AbstractFormParser.handle_data(self, data)
+            self.bs_base_class.handle_data(self, data)
+
+    class RobustFormParser(_AbstractBSFormParser, BeautifulSoup.BeautifulSoup):
+        """Tries to be highly tolerant of incorrect HTML."""
+        bs_base_class = BeautifulSoup.BeautifulSoup
+    class NestingRobustFormParser(_AbstractBSFormParser,
+                                  BeautifulSoup.ICantBelieveItsBeautifulSoup):
+        """Tries to be highly tolerant of incorrect HTML.
+
+        Different from RobustFormParser in that it more often guesses nesting
+        above missing end tags (see BeautifulSoup docs).
+
+        """
+        bs_base_class = BeautifulSoup.ICantBelieveItsBeautifulSoup
+
 #FormParser = XHTMLCompatibleFormParser  # testing hack
+#FormParser = RobustFormParser  # testing hack
 
-def get_entitydefs():
-    entitydefs = {}
-    for name, char in htmlentitydefs.entitydefs.items():
-        entitydefs["&%s;" % name] = char
-    return entitydefs
-
 def ParseResponse(response, select_default=False,
                   ignore_errors=False,  # ignored!
                   form_parser_class=FormParser,
                   request_class=urllib2.Request,
-                  entitydefs=None, backwards_compat=True):
+                  entitydefs=None,
+                  backwards_compat=True,
+                  encoding=DEFAULT_ENCODING,
+                  ):
     """Parse HTTP response and return a list of HTMLForm instances.
 
     The return value of urllib2.urlopen can be conveniently passed to this
@@ -811,11 +869,17 @@
     form_parser_class: class to instantiate and use to pass
     request_class: class to return from .click() method (default is
      urllib2.Request)
-    entitydefs: mapping like {'&amp;': '&', ...} containing HTML entity
+    entitydefs: mapping like {"&amp;": "&", ...} containing HTML entity
      definitions (a sensible default is used)
+    encoding: character encoding used for encoding numeric character references
+     when matching link text.  ClientForm does not attempt to find the encoding
+     in a META HTTP-EQUIV attribute in the document itself (mechanize, for
+     example, does do that and will pass the correct value to ClientForm using
+     this parameter).
 
     backwards_compat: boolean that determines whether the returned HTMLForm
-     objects are backwards-compatible with old code.  If backwards_compat is True:
+     objects are backwards-compatible with old code.  If backwards_compat is
+     true:
 
      - ClientForm 0.1 code will continue to work as before.
 
@@ -844,7 +908,7 @@
 
     There is a choice of parsers.  ClientForm.XHTMLCompatibleFormParser (uses
     HTMLParser.HTMLParser) works best for XHTML, ClientForm.FormParser (uses
-    htmllib.HTMLParser) (the default) works best for ordinary grubby HTML.
+    sgmllib.SGMLParser) (the default) works better for ordinary grubby HTML.
     Note that HTMLParser is only available in Python 2.2 and later.  You can
     pass your own class in here as a hack to work around bad HTML, but at your
     own risk: there is no well-defined interface.
@@ -854,13 +918,19 @@
                      False,
                      form_parser_class,
                      request_class,
-                     entitydefs, backwards_compat)
+                     entitydefs,
+                     backwards_compat,
+                     encoding,
+                     )
 
 def ParseFile(file, base_uri, select_default=False,
               ignore_errors=False,  # ignored!
               form_parser_class=FormParser,
               request_class=urllib2.Request,
-              entitydefs=None, backwards_compat=True):
+              entitydefs=None,
+              backwards_compat=True,
+              encoding=DEFAULT_ENCODING,
+              ):
     """Parse HTML and return a list of HTMLForm instances.
 
     ClientForm.ParseError is raised on parse errors.
@@ -876,7 +946,8 @@
     """
     if backwards_compat:
         deprecation("operating in backwards-compatibility mode")
-    fp = form_parser_class(entitydefs)
+#    import pdb;pdb.set_trace()
+    fp = form_parser_class(entitydefs, encoding)
     while 1:
         data = file.read(CHUNK)
         try:
@@ -916,8 +987,9 @@
             type, name, attrs = controls[ii]
             attrs = fp.unescape_attrs_if_required(attrs)
             name = fp.unescape_attr_if_required(name)
+            # index=ii*10 allows ImageControl to return multiple ordered pairs
             form.new_control(type, name, attrs, select_default=select_default,
-                             index=ii)
+                             index=ii*10)
         forms.append(form)
     for form in forms:
         form.fixup()
@@ -930,7 +1002,7 @@
         self._text = attrs.get("__text").strip()
         self._ctext = compress_text(self._text)
         self.attrs = attrs
-        self._backwards_compat = False  # maintaned by HTMLForm
+        self._backwards_compat = False  # maintained by HTMLForm
 
     def __getattr__(self, name):
         if name == "text":
@@ -942,15 +1014,15 @@
 
     def __setattr__(self, name, value):
         if name == "text":
-            # don't see any need for this
+            # don't see any need for this, so make it read-only
             raise AttributeError("text attribute is read-only")
         self.__dict__[name] = value
 
     def __str__(self):
-        return '<Label(id=%r, text=%r)>' % (self.id, self.text)
+        return "<Label(id=%r, text=%r)>" % (self.id, self.text)
 
 
-def _getLabel(attrs):
+def _get_label(attrs):
     text = attrs.get("__label")
     if text is not None:
         return Label(text)
@@ -1049,15 +1121,14 @@
         """
         raise NotImplementedError()
 
-    def _write_mime_data(self, mw):
-        """Write data for this control to a MimeWriter."""
+    def _write_mime_data(self, mw, name, value):
+        """Write data for a subitem of this control to a MimeWriter."""
         # called by HTMLForm
-        for name, value in self.pairs():
-            mw2 = mw.nextpart()
-            mw2.addheader("Content-disposition",
-                          'form-data; name="%s"' % name, 1)
-            f = mw2.startbody(prefix=0)
-            f.write(value)
+        mw2 = mw.nextpart()
+        mw2.addheader("Content-disposition",
+                      'form-data; name="%s"' % name, 1)
+        f = mw2.startbody(prefix=0)
+        f.write(value)
 
     def __str__(self):
         raise NotImplementedError()
@@ -1093,7 +1164,7 @@
     """
     def __init__(self, type, name, attrs, index=None):
         self._index = index
-        self._label = _getLabel(attrs)
+        self._label = _get_label(attrs)
         self.__dict__["type"] = type.lower()
         self.__dict__["name"] = name
         self._value = attrs.get("value")
@@ -1161,7 +1232,6 @@
 
     INPUT/TEXT
     INPUT/PASSWORD
-    INPUT/FILE
     INPUT/HIDDEN
     TEXTAREA
 
@@ -1219,8 +1289,9 @@
             return []
         return [(self._index, self.name, "")]
 
-    def _write_mime_data(self, mw):
+    def _write_mime_data(self, mw, _name, _value):
         # called by HTMLForm
+        # assert _name == self.name and _value == ''
         if len(self._upload_data) == 1:
             # single file
             file_object, content_type, filename = self._upload_data[0]
@@ -1381,7 +1452,7 @@
 
 class Item:
     def __init__(self, control, attrs, index=None):
-        label = _getLabel(attrs)
+        label = _get_label(attrs)
         self.__dict__.update({
             "name": attrs["value"],
             "_labels": label and [label] or [],
@@ -1793,7 +1864,7 @@
 
     def merge_control(self, control):
         assert bool(control.multiple) == bool(self.multiple)
-        #assert isinstance(control, self.__class__)
+        # usually, isinstance(control, self.__class__)
         self.items.extend(control.items)
 
     def fixup(self):
@@ -2084,6 +2155,9 @@
 
     SELECT (and OPTION)
 
+
+    OPTION 'values', in HTML parlance, are Item 'names' in ClientForm parlance.
+
     SELECT control values and labels are subject to some messy defaulting
     rules.  For example, if the HTML representation of the control is:
 
@@ -2094,9 +2168,9 @@
     </SELECT>
 
     The items, in order, have labels "2002", "2001" and "2000", whereas their
-    values are "0", "1" and "2000" respectively.  Note that the value of the
-    last OPTION in this example defaults to its contents, as specified by RFC
-    1866, as do the labels of the second and third OPTIONs.
+    names (the OPTION values) are "0", "1" and "2000" respectively.  Note that
+    the value of the last OPTION in this example defaults to its contents, as
+    specified by RFC 1866, as do the labels of the second and third OPTIONs.
 
     The OPTION labels are sometimes more meaningful than the OPTION values,
     which can make for more maintainable code.
@@ -2106,14 +2180,13 @@
     The attrs attribute is a dictionary of the original HTML attributes of the
     SELECT element.  Other ListControls do not have this attribute, because in
     other cases the control as a whole does not correspond to any single HTML
-    element.  The get_item_attrs method may be used as usual to get at the
-    HTML attributes of the HTML elements corresponding to individual list items
-    (for SELECT controls, these are OPTION elements).
+    element.  control.get(...).attrs may be used as usual to get at the HTML
+    attributes of the HTML elements corresponding to individual list items (for
+    SELECT controls, these are OPTION elements).
 
-    Another special case is that the attributes dictionaries returned by
-    get_item_attrs have a special key "contents" which does not correspond to
-    any real HTML attribute, but rather contains the contents of the OPTION
-    element:
+    Another special case is that the Item.attrs dictionaries have a special key
+    "contents" which does not correspond to any real HTML attribute, but rather
+    contains the contents of the OPTION element:
 
     <OPTION>this bit</OPTION>
 
@@ -2136,7 +2209,7 @@
         # fish out the SELECT HTML attributes from the OPTION HTML attributes
         # dictionary
         self.attrs = attrs["__select"].copy()
-        self.__dict__["_label"] = _getLabel(self.attrs)
+        self.__dict__["_label"] = _get_label(self.attrs)
         self.__dict__["id"] = self.attrs.get("id")
         self.__dict__["multiple"] = self.attrs.has_key("multiple")
         # the majority of the contents, label, and value dance already happened
@@ -2169,14 +2242,19 @@
     def fixup(self):
         ListControl.fixup(self)
         # Firefox doesn't exclude disabled items from those considered here
-        # (i.e. from 'found', for both brances of the if below).  Note that IE
-        # doesn't support the disabled attribute on OPTIONs at all.
+        # (i.e. from 'found', for both branches of the if below).  Note that
+        # IE6 doesn't support the disabled attribute on OPTIONs at all.
         found = [o for o in self.items if o.selected]
         if not found:
             if not self.multiple or self._select_default:
                 for o in self.items:
                     if not o.disabled:
-                        o.selected = True
+                        was_disabled = self.disabled
+                        self.disabled = False
+                        try:
+                            o.selected = True
+                        finally:
+                            o.disabled = was_disabled
                         break
         elif not self.multiple:
             # Ensure only one item selected.  Choose the last one,
@@ -2245,11 +2323,11 @@
         if name is None: return []
         pairs = [
             (self._index, "%s.x" % name, str(clicked[0])),
-            (self._index, "%s.y" % name, str(clicked[1])),
+            (self._index+1, "%s.y" % name, str(clicked[1])),
             ]
         value = self._value
         if value:
-            pairs.append((self._index, name, value))
+            pairs.append((self._index+2, name, value))
         return pairs
 
     get_labels = ScalarControl.get_labels
@@ -2301,8 +2379,10 @@
     need to be more specific than just supplying the control's name, use the
     set_value and get_value methods.
 
-    ListControl values are lists of item names.  The list item's name is the
-    value of the corresponding HTML element's "value" attribute.
+    ListControl values are lists of item names (specifically, the names of the
+    items that are selected and not disabled, and hence are "successful" -- ie.
+    cause data to be returned to the server).  The list item's name is the
+    value of the corresponding HTML element's"value" attribute.
 
     Example:
 
@@ -2321,11 +2401,12 @@
 
     defines a SELECT control with name "more_cheeses" which has two items,
     named "1" and "2" (because the OPTION element's value HTML attribute
-    defaults to the element contents).
+    defaults to the element contents -- see SelectControl.__doc__ for more on
+    these defaulting rules).
 
     To select, deselect or otherwise manipulate individual list items, use the
     HTMLForm.find_control() and ListControl.get() methods.  To set the whole
-    value, do as for any other control:use indexing or the set_/get_value
+    value, do as for any other control: use indexing or the set_/get_value
     methods.
 
     Example:
@@ -2611,7 +2692,9 @@
 
 #---------------------------------------------------
     def __str__(self):
-        header = "%s %s %s" % (self.method, self.action, self.enctype)
+        header = "%s%s %s %s" % (
+            (self.name and self.name+" " or ""),
+            self.method, self.action, self.enctype)
         rep = [header]
         for control in self.controls:
             rep.append("  %s" % str(control))
@@ -3054,17 +3137,23 @@
 
     def _pairs(self):
         """Return sequence of (key, value) pairs suitable for urlencoding."""
-        opairs = []
-        for control in self.controls:
-            opairs.extend(control._totally_ordered_pairs())
+        return [(k, v) for (i, k, v, c_i) in self._pairs_and_controls()]
 
+
+    def _pairs_and_controls(self):
+        """Return sequence of (index, key, value, control_index)
+        of totally ordered pairs suitable for urlencoding.
+
+        control_index is the index of the control in self.controls
+        """
+        pairs = []
+        for control_index in range(len(self.controls)):
+            control = self.controls[control_index]
+            for ii, key, val in control._totally_ordered_pairs():
+                pairs.append((ii, key, val, control_index))
+
         # stable sort by ONLY first item in tuple
-        sorter = []
-        for jj in range(len(opairs)):
-            ii, key, val = opairs[jj]
-            sorter.append((ii, jj, key, val))
-        sorter.sort()
-        pairs = [(key, val) for (ii, jj, key, val) in sorter]
+        pairs.sort()
 
         return pairs
 
@@ -3094,8 +3183,8 @@
                 mw = MimeWriter(data, http_hdrs)
                 f = mw.startmultipartbody("form-data", add_to_http_hdrs=True,
                                           prefix=0)
-                for control in self.controls:
-                    control._write_mime_data(mw)
+                for ii, k, v, control_index in self._pairs_and_controls():
+                    self.controls[control_index]._write_mime_data(mw, k, v)
                 mw.lastpart()
                 return uri, data.getvalue(), http_hdrs
             else:
@@ -3116,7 +3205,7 @@
             req = request_class(req_data[0], req_data[1])
             for key, val in req_data[2]:
                 add_hdr = req.add_header
-                if key.lower() == 'content-type':
+                if key.lower() == "content-type":
                     try:
                         add_hdr = req.add_unredirected_header
                     except AttributeError:

Modified: Zope3/branches/benji-integrate-new-mechanize/src/pullparser.py
===================================================================
--- Zope3/branches/benji-integrate-new-mechanize/src/pullparser.py	2006-06-19 15:03:17 UTC (rev 68753)
+++ Zope3/branches/benji-integrate-new-mechanize/src/pullparser.py	2006-06-19 15:22:02 UTC (rev 68754)
@@ -25,11 +25,11 @@
     print "Title: %s" % title
 
 
-Copyright 2003-2004 John J. Lee <jjl at pobox.com>
+Copyright 2003-2006 John J. Lee <jjl at pobox.com>
 Copyright 1998-2001 Gisle Aas (original libwww-perl code)
 
 This code is free software; you can redistribute it and/or modify it
-under the terms of the BSD License.
+under the terms of the BSD or ZPL 2.1 licenses.
 
 """
 
@@ -38,8 +38,9 @@
 import re, htmlentitydefs
 import HTMLParser
 
-__version__ = (0, 0, 6, None, None)  # 0.0.6b
+__version__ = (0, 1, 0, None, None)  # 0.1.0
 
+
 class NoMoreTokensError(Exception): pass
 
 class Token:
@@ -100,26 +101,63 @@
         import sys
     return sys.exc_traceback.tb_frame.f_back.f_back.f_code.co_name
 
-def unescape(data, entities):
-    if data is None or '&' not in data:
+def unescape(data, entities, encoding):
+    if data is None or "&" not in data:
         return data
+
     def replace_entities(match):
         ent = match.group()
-        repl = entities.get(ent, ent)
+        if ent[1] == "#":
+            return unescape_charref(ent[2:-1], encoding)
+
+        repl = entities.get(ent)
+        if repl is not None:
+            if type(repl) != type(""):
+                try:
+                    repl = repl.encode(encoding)
+                except UnicodeError:
+                    repl = ent
+        else:
+            repl = ent
+
         return repl
-    return re.sub(r'&\S+?;', replace_entities, data)
 
+    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
+
+def unescape_charref(data, encoding):
+    name, base = data, 10
+    if name.startswith("x"):
+        name, base= name[1:], 16
+    uc = unichr(int(name, base))
+    if encoding is None:
+        return uc
+    else:
+        try:
+            repl = uc.encode(encoding)
+        except UnicodeError:
+            repl = "&#%s;" % data
+        return repl
+
 def get_entitydefs():
     entitydefs = {}
-    for name, char in htmlentitydefs.entitydefs.items():
-        entitydefs["&%s;" % name] = char
+    try:
+        htmlentitydefs.name2codepoint
+    except AttributeError:
+        entitydefs = {}
+        for name, char in htmlentitydefs.entitydefs.items():
+            uc = char.decode("latin-1")
+            if uc.startswith("&#") and uc.endswith(";"):
+                uc = unescape_charref(uc[2:-1], None)
+            entitydefs["&%s;" % name] = uc
+    else:
+        for name, codepoint in htmlentitydefs.name2codepoint.items():
+            entitydefs["&%s;" % name] = unichr(codepoint)
     return entitydefs
 
 
 class _AbstractParser:
     chunk = 1024
     compress_re = re.compile(r"\s+")
-    entitydefs = htmlentitydefs.entitydefs
     def __init__(self, fh, textify={"img": "alt", "applet": "alt"},
                  encoding="ascii", entitydefs=None):
         """
@@ -129,9 +167,17 @@
          to represent opening tags as text
         encoding: encoding used to encode numeric character references by
          .get_text() and .get_compressed_text() ("ascii" by default)
-        entitydefs: mapping like {'&amp;': '&', ...} containing HTML entity
-         definitions (a sensible default is used)
 
+        entitydefs: mapping like {"&amp;": "&", ...} containing HTML entity
+         definitions (a sensible default is used).  This is used to unescape
+         entities in .get_text() (and .get_compressed_text()) and attribute
+         values.  If the encoding can not represent the character, the entity
+         reference is left unescaped.  Note that entity references (both
+         numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
+         unescaped in attribute values and the return value of .get_text(), but
+         not in data outside of tags.  Instead, entity references outside of
+         tags are represented as tokens.  This is a bit odd, it's true :-/
+
         If the element name of an opening tag matches a key in the textify
         mapping then that tag is converted to text.  The corresponding value is
         used to specify which tag attribute to obtain the text from.  textify
@@ -237,10 +283,10 @@
         .get_tag() first (unless you want an empty string returned when you
         next call .get_text()).
 
-        Entity references are translated using the entitydefs attribute (a
-        mapping from names to characters like that provided by the standard
-        module htmlentitydefs).  Named entity references that are not in this
-        mapping are left unchanged.
+        Entity references are translated using the value of the entitydefs
+        constructor argument (a mapping from names to characters like that
+        provided by the standard module htmlentitydefs).  Named entity
+        references that are not in this mapping are left unchanged.
 
         The textify attribute is used to translate opening tags into text: see
         the class docstring.
@@ -258,17 +304,10 @@
             if tok.type == "data":
                 text.append(tok.data)
             elif tok.type == "entityref":
-                name = tok.data
-                if name in self.entitydefs:
-                    t = self.entitydefs[name]
-                else:
-                    t = "&%s;" % name
+                t = unescape("&%s;"%tok.data, self._entitydefs, self.encoding)
                 text.append(t)
             elif tok.type == "charref":
-                name, base = tok.data, 10
-                if name.startswith('x'):
-                    name, base= name[1:], 16
-                t = unichr(int(name, base)).encode(self.encoding)
+                t = unescape_charref(tok.data, self.encoding)
                 text.append(t)
             elif tok.type in ["starttag", "endtag", "startendtag"]:
                 tag_name = tok.data
@@ -322,7 +361,7 @@
         self._tokenstack.append(Token("pi", data))
 
     def unescape_attr(self, name):
-        return unescape(name, self._entitydefs)
+        return unescape(name, self._entitydefs, self.encoding)
     def unescape_attrs(self, attrs):
         escaped_attrs = []
         for key, val in attrs:
@@ -339,6 +378,8 @@
         return self.unescape_attr(name)
 
 import sgmllib
+# monkeypatch to fix http://www.python.org/sf/803422 :-(
+sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
 class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
     def __init__(self, *args, **kwds):
         sgmllib.SGMLParser.__init__(self)

Modified: Zope3/branches/benji-integrate-new-mechanize/src/zope/app/apidoc/browser/README.txt
===================================================================
--- Zope3/branches/benji-integrate-new-mechanize/src/zope/app/apidoc/browser/README.txt	2006-06-19 15:03:17 UTC (rev 68753)
+++ Zope3/branches/benji-integrate-new-mechanize/src/zope/app/apidoc/browser/README.txt	2006-06-19 15:22:02 UTC (rev 68754)
@@ -26,7 +26,7 @@
   ... except HTTPError, error:
   ...     pass
 
-  >>> print error.read()
+  >>> print error.read() # XXX error here
   <...
   <h1 class="details-header">
     Page Not Found

Modified: Zope3/branches/benji-integrate-new-mechanize/src/zope/app/apidoc/codemodule/browser/introspector.txt
===================================================================
--- Zope3/branches/benji-integrate-new-mechanize/src/zope/app/apidoc/codemodule/browser/introspector.txt	2006-06-19 15:03:17 UTC (rev 68753)
+++ Zope3/branches/benji-integrate-new-mechanize/src/zope/app/apidoc/codemodule/browser/introspector.txt	2006-06-19 15:22:02 UTC (rev 68754)
@@ -12,7 +12,7 @@
     >>> browser.handleErrors = False
 
     >>> browser.open('http://localhost/manage')
-    >>> browser.getLink('Introspector').click()
+    >>> browser.getLink('Introspector').click() # XXX error here
 
 The page starts with telling you the class/type
 

Modified: Zope3/branches/benji-integrate-new-mechanize/src/zope/testbrowser/browser.py
===================================================================
--- Zope3/branches/benji-integrate-new-mechanize/src/zope/testbrowser/browser.py	2006-06-19 15:03:17 UTC (rev 68753)
+++ Zope3/branches/benji-integrate-new-mechanize/src/zope/testbrowser/browser.py	2006-06-19 15:22:02 UTC (rev 68754)
@@ -217,17 +217,28 @@
         """See zope.testbrowser.interfaces.IBrowser"""
         self._start_timer()
         try:
-            self.mech_browser.open(url, data)
-        except urllib2.HTTPError, e:
-            if e.code >= 200 and e.code <= 299:
-                # 200s aren't really errors
-                pass
-            else:
-                raise
+            try:
+                self.mech_browser.open(url, data)
+            except urllib2.HTTPError, e:
+                if e.code >= 200 and e.code <= 299:
+                    # 200s aren't really errors
+                    pass
+                else:
+                    raise
+        finally:
+            self._stop_timer()
+            self._changed()
 
-        self._stop_timer()
-        self._changed()
+        # if the headers don't have a status, I suppose there can't be an error
+        if 'Status' in self.headers:
+            code, msg = self.headers['Status'].split(' ', 1)
+            code = int(code)
+            fp = self.mech_browser.open(self.mech_browser.request)
+            if code >= 400:
+                raise urllib2.HTTPError(url, code, msg, self.headers, fp)
 
+            if code >= 300: import pdb;pdb.set_trace()
+
     def _start_timer(self):
         self.timer.start()
 
@@ -290,22 +301,31 @@
         matches = re.compile(r'(^|\b|\W)%s(\b|\W|$)'
                              % re.escape(compressText(label))).search
         found = []
+#        import pdb;pdb.set_trace()
         for f in forms:
             for control in f.controls:
                 phantom = control.type in ('radio', 'checkbox')
                 if include_subcontrols and (
+                    phantom or control.type=='select'):
 
-                    phantom or control.type=='select'):
+                    found_one = False
                     for i in control.items:
                         for l in i.get_labels():
                             if matches(l.text):
                                 found.append((i, f))
+                                found_one = True
                                 break
+
+                    if found_one:
+                        del found_one
+                        continue
+
                 if not phantom:
                     for l in control.get_labels():
                         if matches(l.text):
                             found.append((control, f))
                             break
+#        if len(found) > 1: import pdb;pdb.set_trace()
         return found
 
     def _findByName(self, name, forms):

Modified: Zope3/branches/benji-integrate-new-mechanize/src/zope/testbrowser/testing.py
===================================================================
--- Zope3/branches/benji-integrate-new-mechanize/src/zope/testbrowser/testing.py	2006-06-19 15:03:17 UTC (rev 68753)
+++ Zope3/branches/benji-integrate-new-mechanize/src/zope/testbrowser/testing.py	2006-06-19 15:22:02 UTC (rev 68754)
@@ -127,30 +127,30 @@
 class PublisherMechanizeBrowser(mechanize.Browser):
     """Special ``mechanize`` browser using the Zope Publisher HTTP handler."""
 
-    handler_classes = {
-        # scheme handlers
-        "http": PublisherHTTPHandler,
-
-        "_http_error": ClientCookie.HTTPErrorProcessor,
-        "_http_request_upgrade": ClientCookie.HTTPRequestUpgradeProcessor,
-        "_http_default_error": urllib2.HTTPDefaultErrorHandler,
-
-        # feature handlers
-        "_authen": urllib2.HTTPBasicAuthHandler,
-        "_redirect": ClientCookie.HTTPRedirectHandler,
-        "_cookies": ClientCookie.HTTPCookieProcessor,
-        "_refresh": ClientCookie.HTTPRefreshProcessor,
-        "_referer": mechanize.Browser.handler_classes['_referer'],
-        "_equiv": ClientCookie.HTTPEquivProcessor,
-        "_seek": ClientCookie.SeekableProcessor,
-        }
-
     default_schemes = ["http"]
     default_others = ["_http_error", "_http_request_upgrade",
                       "_http_default_error"]
     default_features = ["_authen", "_redirect", "_cookies", "_seek"]
 
+    default_features = ["_redirect", "_cookies", "_referer",
+                        "_refresh", "_equiv",
+                        "_basicauth", "_digestauth",
+                        "_seek",
+                        ]
 
+    def __init__(self, *args, **kws):
+        inherited_handlers = ['_unknown', '_http_error',
+            '_http_request_upgrade', '_http_default_error', '_basicauth',
+            '_digestauth', '_authen', '_redirect', '_cookies', '_referer',
+            '_refresh', '_equiv', '_seek', '_gzip']
+
+        self.handler_classes = {"http": PublisherHTTPHandler}
+        for name in inherited_handlers:
+            self.handler_classes[name] = mechanize.Browser.handler_classes[name]
+
+        mechanize.Browser.__init__(self, *args, **kws)
+
+
 class Browser(browser.Browser):
     """A Zope `testbrowser` Browser that uses the Zope Publisher."""
 

Modified: Zope3/branches/benji-integrate-new-mechanize/src/zope/testbrowser/tests.py
===================================================================
--- Zope3/branches/benji-integrate-new-mechanize/src/zope/testbrowser/tests.py	2006-06-19 15:03:17 UTC (rev 68753)
+++ Zope3/branches/benji-integrate-new-mechanize/src/zope/testbrowser/tests.py	2006-06-19 15:22:02 UTC (rev 68754)
@@ -222,6 +222,8 @@
 checker = renormalizing.RENormalizing([
     (re.compile(r'^--\S+\.\S+\.\S+', re.M), '-'*30),
     (re.compile(r'boundary=\S+\.\S+\.\S+'), 'boundary='+'-'*30),
+    (re.compile(r'^---{10}.*', re.M), '-'*30),
+    (re.compile(r'boundary=-{10}.*'), 'boundary='+'-'*30),
     (re.compile('User-agent:\s+\S+'), 'User-agent: XXX'),
     (re.compile('Content-length:\s+\S+'), 'Content-length: 123'),
     ])