[Zope3-checkins] SVN: Zope3/trunk/src/ This is an intermediate checkin, to get the testbrowser code closer to our desired end-state.
Gary Poster
gary at zope.com
Tue Nov 1 12:34:15 EST 2005
Log message for revision 39818:
This is an intermediate checkin, to get the testbrowser code closer to our desired end-state.
Positive changes: we are using the current version of the testbrowser dependencies. We no longer hack them in via sys.modules (they no longer could be, actually), and we no longer hack the dependencies themselves. The goBack bug should be fixed. The dependencies should pass all their tests (in their own packages, not yet imported). The memory problem Stephan found and fixed in our hacked-up copies should still be fixed, but now using the dependency code that we are not maintaining.
Negative changes: we have some modules (not packages) in the src directory; Fred and Jim agree that's the best we can do at the moment, but we're hopeful that some work that Philipp is doing can make this prettier. The tests for the dependencies are still not included. The dependencies themselves now appear to depend on Python being built with SSL support. These issues are going to be discussed further and hopefully addressed.
Changed:
A Zope3/trunk/src/ClientCookie/
A Zope3/trunk/src/ClientCookie/_BSDDBCookieJar.py
A Zope3/trunk/src/ClientCookie/_ClientCookie.py
A Zope3/trunk/src/ClientCookie/_ConnCache.py
A Zope3/trunk/src/ClientCookie/_Debug.py
A Zope3/trunk/src/ClientCookie/_HeadersUtil.py
A Zope3/trunk/src/ClientCookie/_LWPCookieJar.py
A Zope3/trunk/src/ClientCookie/_MSIECookieJar.py
A Zope3/trunk/src/ClientCookie/_MSIEDBCookieJar.py
A Zope3/trunk/src/ClientCookie/_MozillaCookieJar.py
A Zope3/trunk/src/ClientCookie/_Opener.py
A Zope3/trunk/src/ClientCookie/_Request.py
A Zope3/trunk/src/ClientCookie/_Util.py
A Zope3/trunk/src/ClientCookie/__init__.py
A Zope3/trunk/src/ClientCookie/_urllib2_support.py
A Zope3/trunk/src/ClientForm.py
A Zope3/trunk/src/mechanize/
A Zope3/trunk/src/mechanize/__init__.py
A Zope3/trunk/src/mechanize/_mechanize.py
A Zope3/trunk/src/mechanize/_useragent.py
A Zope3/trunk/src/pullparser.py
D Zope3/trunk/src/zope/testbrowser/BUGFIXES.txt
D Zope3/trunk/src/zope/testbrowser/ClientCookie/
D Zope3/trunk/src/zope/testbrowser/ClientForm.py
U Zope3/trunk/src/zope/testbrowser/__init__.py
U Zope3/trunk/src/zope/testbrowser/browser.py
D Zope3/trunk/src/zope/testbrowser/mechanize/
D Zope3/trunk/src/zope/testbrowser/pullparser.py
-=-
Added: Zope3/trunk/src/ClientCookie/_BSDDBCookieJar.py
===================================================================
--- Zope3/trunk/src/ClientCookie/_BSDDBCookieJar.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/ClientCookie/_BSDDBCookieJar.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,180 @@
+"""Persistent CookieJar based on bsddb standard library module.
+
+Copyright 2003-2004 John J Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD License (see the file COPYING included with the
+distribution).
+
+**********************************************************************
+THIS IS NOT FULLY TESTED!
+**********************************************************************
+
+"""
+
+from _ClientCookie import CookieJar, MappingIterator
+from _Debug import getLogger
+debug = getLogger("ClientCookie").debug
+
+import bsddb
+import cPickle
+pickle = cPickle
+del cPickle
+
+try: StopIteration
+except NameError:
+ from _ClientCookie import StopIteration
+
+def CreateBSDDBCookieJar(filename, policy=None):
+ """Return a BSDDBCookieJar given a BSDDB filename.
+
+ Use this rather than directly using the BSDDBCookieJar constructor,
+ unless you know what you're doing.
+
+ filename: filename for sleepycat BSDDB database; if the file doesn't exist,
+ it will be created; otherwise, it will be opened
+
+ **********************************************************************
+ BSDDBCookieJar IS NOT FULLY TESTED!
+ **********************************************************************
+
+ """
+ db = bsddb.db.DB()
+ db.open(filename, bsddb.db.DB_HASH, bsddb.db.DB_CREATE, 0666)
+ return BSDDBCookieJar(policy, db)
+
+class BSDDBIterator:
+ # XXXX should this use thread lock?
+ def __init__(self, cursor):
+ iterator = None
+ self._c = cursor
+ self._i = iterator
+ def __iter__(self): return self
+ def close(self):
+ if self._c is not None:
+ self._c.close()
+ self._c = self._i = self.next = self.__iter__ = None
+ def next(self):
+ while 1:
+ if self._i is None:
+ item = self._c.next()
+ if item is None:
+ self.close()
+ raise StopIteration()
+ domain, data = item
+ self._i = MappingIterator(pickle.loads(data))
+ try:
+ return self._i.next()
+ except StopIteration:
+ self._i = None
+ continue
+ def __del__(self):
+ # XXXX will this work?
+ self.close()
+
+class BSDDBCookieJar(CookieJar):
+ """CookieJar based on a BSDDB database, using the standard bsddb module.
+
+ You should use CreateBSDDBCookieJar instead of the constructor, unless you
+ know what you're doing.
+
+ Note that session cookies ARE stored in the database (marked as session
+ cookies), and will be written to disk if the database is file-based. In
+ order to clear session cookies at the end of a session, you must call
+ .clear_session_cookies().
+
+ Call the .close() method after you've finished using an instance of this
+ class.
+
+ **********************************************************************
+ THIS IS NOT FULLY TESTED!
+ **********************************************************************
+
+ """
+ # XXX
+ # use transactions to make multiple reader processes possible
+ def __init__(self, policy=None, db=None):
+ CookieJar.__init__(self, policy)
+ del self._cookies
+ if db is None:
+ db = bsddb.db.DB()
+ self._db = db
+ def close(self):
+ self._db.close()
+ def __del__(self):
+ # XXXX will this work?
+ self.close()
+ def clear(self, domain=None, path=None, name=None):
+ if name is not None:
+ if (domain is None) or (path is None):
+ raise ValueError(
+ "domain and path must be given to remove a cookie by name")
+ elif path is not None:
+ if domain is None:
+ raise ValueError(
+ "domain must be given to remove cookies by path")
+
+ db = self._db
+ self._cookies_lock.acquire()
+ try:
+ if domain is not None:
+ data = db.get(domain)
+ if data is not None:
+ if path is name is None:
+ db.delete(domain)
+ else:
+ c2 = pickle.loads(data)
+ if name is None:
+ del c2[path]
+ else:
+ del c2[path][name]
+ else:
+ raise KeyError("no domain '%s'" % domain)
+ finally:
+ self._cookies_lock.release()
+ def set_cookie(self, cookie):
+ db = self._db
+ self._cookies_lock.acquire()
+ try:
+ # store 2-level dict under domain, like {path: {name: value}}
+ data = db.get(cookie.domain)
+ if data is None:
+ c2 = {}
+ else:
+ c2 = pickle.loads(data)
+ if not c2.has_key(cookie.path): c2[cookie.path] = {}
+ c3 = c2[cookie.path]
+ c3[cookie.name] = cookie
+ db.put(cookie.domain, pickle.dumps(c2))
+ finally:
+ self._cookies_lock.release()
+ def __iter__(self):
+ return BSDDBIterator(self._db.cursor())
+ def _cookies_for_request(self, request):
+ """Return a list of cookies to be returned to server."""
+ cookies = []
+ for domain in self._db.keys():
+ cookies.extend(self._cookies_for_domain(domain, request))
+ return cookies
+ def _cookies_for_domain(self, domain, request, unverifiable):
+ debug("Checking %s for cookies to return", domain)
+ if not self._policy.domain_return_ok(domain, request, unverifiable):
+ return []
+
+ data = self._db.get(domain)
+ if data is None:
+ return []
+ cookies_by_path = pickle.loads(data)
+
+ cookies = []
+ for path in cookies_by_path.keys():
+ if not self._policy.path_return_ok(path, request, unverifiable):
+ continue
+ for name, cookie in cookies_by_path[path].items():
+ if not self._policy.return_ok(cookie, request, unverifiable):
+ debug(" not returning cookie")
+ continue
+ debug(" it's a match")
+ cookies.append(cookie)
+
+ return cookies
Added: Zope3/trunk/src/ClientCookie/_ClientCookie.py
===================================================================
--- Zope3/trunk/src/ClientCookie/_ClientCookie.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/ClientCookie/_ClientCookie.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,1687 @@
+"""HTTP cookie handling for web clients, plus some other stuff.
+
+This module originally developed from my port of Gisle Aas' Perl module
+HTTP::Cookies, from the libwww-perl library.
+
+Docstrings, comments and debug strings in this code refer to the
+attributes of the HTTP cookie system as cookie-attributes, to distinguish
+them clearly from Python attributes.
+
+                        CookieJar____
+                        /     \      \
+            FileCookieJar      \      \
+             /    |   \         \      \
+ MozillaCookieJar | LWPCookieJar \      \
+                  |               |      \
+                  |   ---MSIEBase |       \
+                  |  /      |     |        \
+                  | /   MSIEDBCookieJar BSDDBCookieJar
+                  |/
+               MSIECookieJar
+
+Comments to John J Lee <jjl at pobox.com>.
+
+
+Copyright 2002-2005 John J Lee <jjl at pobox.com>
+Copyright 1997-1999 Gisle Aas (original libwww-perl code)
+Copyright 2002-2003 Johnny Lee (original MSIE Perl code)
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD License (see the file COPYING included with the
+distribution).
+
+"""
+
+VERSION = "1.0.3"
+
+
+# Public health warning: anyone who thought 'cookies are simple, aren't they?',
+# run away now :-(
+
+import sys, re, urlparse, string, copy, time, struct, urllib, types
+try:
+ import threading
+ _threading = threading; del threading
+except ImportError:
+ import dummy_threading
+ _threading = dummy_threading; del dummy_threading
+import httplib # only for the default HTTP port
+
+MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
+ "instance initialised with one)")
+DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
+
+try: True
+except NameError:
+ True = 1
+ False = 0
+
+try:
+ from types import UnicodeType
+except ImportError:
+ UNICODE = False
+else:
+ UNICODE = True
+
+try: StopIteration
+except NameError:
+ class StopIteration(Exception): pass
+
+import ClientCookie
+from _HeadersUtil import split_header_words, parse_ns_headers
+from _Util import startswith, endswith, isstringlike, getheaders
+from _Debug import warn, getLogger
+debug = getLogger("ClientCookie.cookies").debug
+
+try: bool
+except NameError:
+ def bool(expr):
+ if expr: return True
+ else: return False
+
+try: issubclass(Exception, (Exception,))
+except TypeError:
+ real_issubclass = issubclass
+ from _Util import compat_issubclass
+ issubclass = compat_issubclass
+ del compat_issubclass
+
+def reraise_unmasked_exceptions(unmasked=()):
+ # There are a few catch-all except: statements in this module, for
+ # catching input that's bad in unexpected ways.
+ # This function re-raises some exceptions we don't want to trap.
+ if not ClientCookie.USE_BARE_EXCEPT:
+ raise
+ unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError)
+ etype = sys.exc_info()[0]
+ if issubclass(etype, unmasked):
+ raise
+ # swallowed an exception
+ import traceback, StringIO
+ f = StringIO.StringIO()
+ traceback.print_exc(None, f)
+ msg = f.getvalue()
+ warn("ClientCookie bug!\n%s" % msg)
+
+
+IPV4_RE = re.compile(r"\.\d+$")
+def is_HDN(text):
+ """Return True if text is a host domain name."""
+ # XXX
+ # This may well be wrong. Which RFC is HDN defined in, if any (for
+ # the purposes of RFC 2965)?
+ # For the current implementation, what about IPv6? Remember to look
+ # at other uses of IPV4_RE also, if you change this.
+ return not (IPV4_RE.search(text) or
+ text == "" or
+ text[0] == "." or text[-1] == ".")
+
+def domain_match(A, B):
+ """Return True if domain A domain-matches domain B, according to RFC 2965.
+
+ A and B may be host domain names or IP addresses.
+
+ RFC 2965, section 1:
+
+ Host names can be specified either as an IP address or a HDN string.
+ Sometimes we compare one host name with another. (Such comparisons SHALL
+ be case-insensitive.) Host A's name domain-matches host B's if
+
+ * their host name strings string-compare equal; or
+
+ * A is a HDN string and has the form NB, where N is a non-empty
+ name string, B has the form .B', and B' is a HDN string. (So,
+ x.y.com domain-matches .Y.com but not Y.com.)
+
+ Note that domain-match is not a commutative operation: a.b.c.com
+ domain-matches .c.com, but not the reverse.
+
+ """
+ # Note that, if A or B are IP addresses, the only relevant part of the
+ # definition of the domain-match algorithm is the direct string-compare.
+ A = string.lower(A)
+ B = string.lower(B)
+ if A == B:
+ return True
+ if not is_HDN(A):
+ return False
+ i = string.rfind(A, B)
+ has_form_nb = not (i == -1 or i == 0)
+ return (
+ has_form_nb and
+ startswith(B, ".") and
+ is_HDN(B[1:])
+ )
+
+def liberal_is_HDN(text):
+ """Return True if text is sort-of-like a host domain name.
+
+ For accepting/blocking domains.
+
+ """
+ return not IPV4_RE.search(text)
+
+def user_domain_match(A, B):
+ """For blocking/accepting domains.
+
+ A and B may be host domain names or IP addresses.
+
+ """
+ A = string.lower(A)
+ B = string.lower(B)
+ if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
+ if A == B:
+ # equal IP addresses
+ return True
+ return False
+ initial_dot = startswith(B, ".")
+ if initial_dot and endswith(A, B):
+ return True
+ if not initial_dot and A == B:
+ return True
+ return False
+
+cut_port_re = re.compile(r":\d+$")
+def request_host(request):
+ """Return request-host, as defined by RFC 2965.
+
+ Variation from RFC: returned value is lowercased, for convenient
+ comparison.
+
+ """
+ url = request.get_full_url()
+ host = urlparse.urlparse(url)[1]
+ if host == "":
+ host = request.get_header("Host", "")
+
+ # remove port, if present
+ host = cut_port_re.sub("", host, 1)
+ return string.lower(host)
+
+def eff_request_host(request):
+ """Return a tuple (request-host, effective request-host name).
+
+ As defined by RFC 2965, except both are lowercased.
+
+ """
+ erhn = req_host = request_host(request)
+ if string.find(req_host, ".") == -1 and not IPV4_RE.search(req_host):
+ erhn = req_host + ".local"
+ return req_host, erhn
+
+def request_path(request):
+ """request-URI, as defined by RFC 2965."""
+ url = request.get_full_url()
+ #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(url)
+ #req_path = escape_path(string.join(urlparse.urlparse(url)[2:], ""))
+ path, parameters, query, frag = urlparse.urlparse(url)[2:]
+ if parameters:
+ path = "%s;%s" % (path, parameters)
+ path = escape_path(path)
+ req_path = urlparse.urlunparse(("", "", path, "", query, frag))
+ if not startswith(req_path, "/"):
+ # fix bad RFC 2396 absoluteURI
+ req_path = "/"+req_path
+ return req_path
+
+def request_port(request):
+ host = request.get_host()
+ i = string.find(host, ':')
+ if i >= 0:
+ port = host[i+1:]
+ try:
+ int(port)
+ except ValueError:
+ debug("nonnumeric port: '%s'", port)
+ return None
+ else:
+ port = DEFAULT_HTTP_PORT
+ return port
+
+# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
+# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
+HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
+ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
+def uppercase_escaped_char(match):
+ return "%%%s" % string.upper(match.group(1))
+def escape_path(path):
+ """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
+ # There's no knowing what character encoding was used to create URLs
+ # containing %-escapes, but since we have to pick one to escape invalid
+ # path characters, we pick UTF-8, as recommended in the HTML 4.0
+ # specification:
+ # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
+ # And here, kind of: draft-fielding-uri-rfc2396bis-03
+ # (And in draft IRI specification: draft-duerst-iri-05)
+ # (And here, for new URI schemes: RFC 2718)
+ if UNICODE and isinstance(path, types.UnicodeType):
+ path = path.encode("utf-8")
+ path = urllib.quote(path, HTTP_PATH_SAFE)
+ path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path)
+ return path
+
+def reach(h):
+ """Return reach of host h, as defined by RFC 2965, section 1.
+
+ The reach R of a host name H is defined as follows:
+
+ * If
+
+ - H is the host domain name of a host; and,
+
+ - H has the form A.B; and
+
+ - A has no embedded (that is, interior) dots; and
+
+ - B has at least one embedded dot, or B is the string "local".
+ then the reach of H is .B.
+
+ * Otherwise, the reach of H is H.
+
+ >>> reach("www.acme.com")
+ '.acme.com'
+ >>> reach("acme.com")
+ 'acme.com'
+ >>> reach("acme.local")
+ '.local'
+
+ """
+ i = string.find(h, ".")
+ if i >= 0:
+ #a = h[:i] # this line is only here to show what a is
+ b = h[i+1:]
+ i = string.find(b, ".")
+ if is_HDN(h) and (i >= 0 or b == "local"):
+ return "."+b
+ return h
+
+def is_third_party(request):
+ """
+
+ RFC 2965, section 3.3.6:
+
+ An unverifiable transaction is to a third-party host if its request-
+ host U does not domain-match the reach R of the request-host O in the
+ origin transaction.
+
+ """
+ req_host = request_host(request)
+ # the origin request's request-host was stuffed into request by
+ # _urllib2_support.AbstractHTTPHandler
+ return not domain_match(req_host, reach(request.origin_req_host))
+
+
+class Cookie:
+ """HTTP Cookie.
+
+ This class represents both Netscape and RFC 2965 cookies.
+
+ This is deliberately a very simple class. It just holds attributes. It's
+ possible to construct Cookie instances that don't comply with the cookie
+ standards. CookieJar.make_cookies is the factory function for Cookie
+ objects -- it deals with cookie parsing, supplying defaults, and
+ normalising to the representation used in this class. CookiePolicy is
+ responsible for checking them to see whether they should be accepted from
+ and returned to the server.
+
+ version: integer;
+ name: string;
+ value: string (may be None);
+ port: string; None indicates no attribute was supplied (eg. "Port", rather
+ than eg. "Port=80"); otherwise, a port string (eg. "80") or a port list
+ string (eg. "80,8080")
+ port_specified: boolean; true if a value was supplied with the Port
+ cookie-attribute
+ domain: string;
+ domain_specified: boolean; true if Domain was explicitly set
+ domain_initial_dot: boolean; true if Domain as set in HTTP header by server
+ started with a dot (yes, this really is necessary!)
+ path: string;
+ path_specified: boolean; true if Path was explicitly set
+ secure: boolean; true if should only be returned over secure connection
+ expires: integer; seconds since epoch (RFC 2965 cookies should calculate
+ this value from the Max-Age attribute)
+ discard: boolean, true if this is a session cookie; (if no expires value,
+ this should be true)
+ comment: string;
+ comment_url: string;
+ rfc2109: boolean; true if cookie arrived in a Set-Cookie: (not
+ Set-Cookie2:) header, but had a version cookie-attribute of 1
+ rest: mapping of other cookie-attributes
+
+ Note that the port may be present in the headers, but unspecified ("Port"
+ rather than "Port=80", for example); if this is the case, port is None.
+
+ """
+
+ def __init__(self, version, name, value,
+ port, port_specified,
+ domain, domain_specified, domain_initial_dot,
+ path, path_specified,
+ secure,
+ expires,
+ discard,
+ comment,
+ comment_url,
+ rest,
+ rfc2109=False,
+ ):
+
+ if version is not None: version = int(version)
+ if expires is not None: expires = int(expires)
+ if port is None and port_specified is True:
+ raise ValueError("if port is None, port_specified must be false")
+
+ self.version = version
+ self.name = name
+ self.value = value
+ self.port = port
+ self.port_specified = port_specified
+ # normalise case, as per RFC 2965 section 3.3.3
+ self.domain = string.lower(domain)
+ self.domain_specified = domain_specified
+ # Sigh. We need to know whether the domain given in the
+ # cookie-attribute had an initial dot, in order to follow RFC 2965
+ # (as clarified in draft errata). Needed for the returned $Domain
+ # value.
+ self.domain_initial_dot = domain_initial_dot
+ self.path = path
+ self.path_specified = path_specified
+ self.secure = secure
+ self.expires = expires
+ self.discard = discard
+ self.comment = comment
+ self.comment_url = comment_url
+ self.rfc2109 = rfc2109
+
+ self._rest = copy.copy(rest)
+
+ def has_nonstandard_attr(self, name):
+ return self._rest.has_key(name)
+ def get_nonstandard_attr(self, name, default=None):
+ return self._rest.get(name, default)
+ def set_nonstandard_attr(self, name, value):
+ self._rest[name] = value
+ def nonstandard_attr_keys(self):
+ return self._rest.keys()
+
+ def is_expired(self, now=None):
+ if now is None: now = time.time()
+ return (self.expires is not None) and (self.expires <= now)
+
+ def __str__(self):
+ if self.port is None: p = ""
+ else: p = ":"+self.port
+ limit = self.domain + p + self.path
+ if self.value is not None:
+ namevalue = "%s=%s" % (self.name, self.value)
+ else:
+ namevalue = self.name
+ return "<Cookie %s for %s>" % (namevalue, limit)
+
+ def __repr__(self):
+ args = []
+ for name in ["version", "name", "value",
+ "port", "port_specified",
+ "domain", "domain_specified", "domain_initial_dot",
+ "path", "path_specified",
+ "secure", "expires", "discard", "comment", "comment_url",
+ ]:
+ attr = getattr(self, name)
+ args.append("%s=%s" % (name, repr(attr)))
+ args.append("rest=%s" % repr(self._rest))
+ args.append("rfc2109=%s" % repr(self.rfc2109))
+ return "Cookie(%s)" % string.join(args, ", ")
+
+
+class CookiePolicy:
+ """Defines which cookies get accepted from and returned to server.
+
+ May also modify cookies.
+
+ The subclass DefaultCookiePolicy defines the standard rules for Netscape
+ and RFC 2965 cookies -- override that if you want a customised policy.
+
+ As well as implementing set_ok and return_ok, implementations of this
+ interface must also supply the following attributes, indicating which
+ protocols should be used, and how. These can be read and set at any time,
+ though whether that makes complete sense from the protocol point of view is
+ doubtful.
+
+ Public attributes:
+
+ netscape: implement netscape protocol
+ rfc2965: implement RFC 2965 protocol
+ rfc2109_as_netscape:
+ WARNING: This argument will change or go away if it is not accepted into
+ the Python standard library in this form!
+ If true, treat RFC 2109 cookies as though they were Netscape cookies. The
+ default is for this attribute to be None, which means treat 2109 cookies
+ as RFC 2965 cookies unless RFC 2965 handling is switched off (which it is,
+ by default), and as Netscape cookies otherwise.
+ hide_cookie2: don't add Cookie2 header to requests (the presence of
+ this header indicates to the server that we understand RFC 2965
+ cookies)
+
+ """
+ def set_ok(self, cookie, request):
+ """Return true if (and only if) cookie should be accepted from server.
+
+ Currently, pre-expired cookies never get this far -- the CookieJar
+ class deletes such cookies itself.
+
+ cookie: ClientCookie.Cookie object
+ request: object implementing the interface defined by
+ CookieJar.extract_cookies.__doc__
+
+ """
+ raise NotImplementedError()
+
+ def return_ok(self, cookie, request):
+ """Return true if (and only if) cookie should be returned to server.
+
+ cookie: ClientCookie.Cookie object
+ request: object implementing the interface defined by
+ CookieJar.add_cookie_header.__doc__
+
+ """
+ raise NotImplementedError()
+
+ def domain_return_ok(self, domain, request):
+ """Return false if cookies should not be returned, given cookie domain.
+
+ This is here as an optimization, to remove the need for checking every
+ cookie with a particular domain (which may involve reading many files).
+ The default implementations of domain_return_ok and path_return_ok
+ (return True) leave all the work to return_ok.
+
+ If domain_return_ok returns true for the cookie domain, path_return_ok
+ is called for the cookie path. Otherwise, path_return_ok and return_ok
+ are never called for that cookie domain. If path_return_ok returns
+ true, return_ok is called with the Cookie object itself for a full
+ check. Otherwise, return_ok is never called for that cookie path.
+
+ Note that domain_return_ok is called for every *cookie* domain, not
+ just for the *request* domain. For example, the function might be
+ called with both ".acme.com" and "www.acme.com" if the request domain is
+ "www.acme.com". The same goes for path_return_ok.
+
+ For argument documentation, see the docstring for return_ok.
+
+ """
+ return True
+
+ def path_return_ok(self, path, request):
+ """Return false if cookies should not be returned, given cookie path.
+
+ See the docstring for domain_return_ok.
+
+ """
+ return True
+
+
+class DefaultCookiePolicy(CookiePolicy):
+ """Implements the standard rules for accepting and returning cookies.
+
+ Both RFC 2965 and Netscape cookies are covered. RFC 2965 handling is
+ switched off by default.
+
+ The easiest way to provide your own policy is to override this class and
+ call its methods in your overridden implementations before adding your own
+ additional checks.
+
+ import ClientCookie
+ class MyCookiePolicy(ClientCookie.DefaultCookiePolicy):
+ def set_ok(self, cookie, request):
+ if not ClientCookie.DefaultCookiePolicy.set_ok(
+ self, cookie, request):
+ return False
+ if i_dont_want_to_store_this_cookie():
+ return False
+ return True
+
+ In addition to the features required to implement the CookiePolicy
+ interface, this class allows you to block and allow domains from setting
+ and receiving cookies. There are also some strictness switches that allow
+ you to tighten up the rather loose Netscape protocol rules a little bit (at
+ the cost of blocking some benign cookies).
+
+ A domain blacklist and whitelist is provided (both off by default). Only
+ domains not in the blacklist and present in the whitelist (if the whitelist
+ is active) participate in cookie setting and returning. Use the
+ blocked_domains constructor argument, and blocked_domains and
+ set_blocked_domains methods (and the corresponding argument and methods for
+ allowed_domains). If you set a whitelist, you can turn it off again by
+ setting it to None.
+
+ Domains in block or allow lists that do not start with a dot must
+ string-compare equal. For example, "acme.com" matches a blacklist entry of
+ "acme.com", but "www.acme.com" does not. Domains that do start with a dot
+ are matched by more specific domains too. For example, both "www.acme.com"
+ and "www.munitions.acme.com" match ".acme.com" (but "acme.com" itself does
+ not). IP addresses are an exception, and must match exactly. For example,
+ if blocked_domains contains "192.168.1.2" and ".168.1.2", 192.168.1.2 is
+ blocked, but 193.168.1.2 is not.
+
+ Additional Public Attributes:
+
+ General strictness switches
+
+ strict_domain: don't allow sites to set two-component domains with
+ country-code top-level domains like .co.uk, .gov.uk, .co.nz, etc.
+ This is far from perfect and isn't guaranteed to work!
+
+ RFC 2965 protocol strictness switches
+
+ strict_rfc2965_unverifiable: follow RFC 2965 rules on unverifiable
+ transactions (usually, an unverifiable transaction is one resulting from
+ a redirect or an image hosted on another site); if this is false, cookies
+ are NEVER blocked on the basis of verifiability
+
+ Netscape protocol strictness switches
+
+ strict_ns_unverifiable: apply RFC 2965 rules on unverifiable transactions
+ even to Netscape cookies
+ strict_ns_domain: flags indicating how strict to be with domain-matching
+ rules for Netscape cookies:
+ DomainStrictNoDots: when setting cookies, host prefix must not contain a
+ dot (eg. www.foo.bar.com can't set a cookie for .bar.com, because
+ www.foo contains a dot)
+ DomainStrictNonDomain: cookies that did not explicitly specify a Domain
+ cookie-attribute can only be returned to a domain that string-compares
+ equal to the domain that set the cookie (eg. rockets.acme.com won't
+ be returned cookies from acme.com that had no Domain cookie-attribute)
+ DomainRFC2965Match: when setting cookies, require a full RFC 2965
+ domain-match
+ DomainLiberal and DomainStrict are the most useful combinations of the
+ above flags, for convenience
+ strict_ns_set_initial_dollar: ignore cookies in Set-Cookie: headers that
+ have names starting with '$'
+ strict_ns_set_path: don't allow setting cookies whose path doesn't
+ path-match request URI
+
+ """
+
+ DomainStrictNoDots = 1
+ DomainStrictNonDomain = 2
+ DomainRFC2965Match = 4
+
+ DomainLiberal = 0
+ DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
+
+ def __init__(self,
+ blocked_domains=None, allowed_domains=None,
+ netscape=True, rfc2965=False,
+ # WARNING: this argument will change or go away if it is not
+ # accepted into the Python standard library in this form!
+ # default, ie. treat 2109 as netscape iff not rfc2965
+ rfc2109_as_netscape=None,
+ hide_cookie2=False,
+ strict_domain=False,
+ strict_rfc2965_unverifiable=True,
+ strict_ns_unverifiable=False,
+ strict_ns_domain=DomainLiberal,
+ strict_ns_set_initial_dollar=False,
+ strict_ns_set_path=False,
+ ):
+ """
+ Constructor arguments should be used as keyword arguments only.
+
+ blocked_domains: sequence of domain names that we never accept cookies
+ from, nor return cookies to
+ allowed_domains: if not None, this is a sequence of the only domains
+ for which we accept and return cookies
+
+ For other arguments, see CookiePolicy.__doc__ and
+ DefaultCookiePolicy.__doc__.
+
+ """
+ self.netscape = netscape
+ self.rfc2965 = rfc2965
+ self.rfc2109_as_netscape = rfc2109_as_netscape
+ self.hide_cookie2 = hide_cookie2
+ self.strict_domain = strict_domain
+ self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
+ self.strict_ns_unverifiable = strict_ns_unverifiable
+ self.strict_ns_domain = strict_ns_domain
+ self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
+ self.strict_ns_set_path = strict_ns_set_path
+
+ if blocked_domains is not None:
+ self._blocked_domains = tuple(blocked_domains)
+ else:
+ self._blocked_domains = ()
+
+ if allowed_domains is not None:
+ allowed_domains = tuple(allowed_domains)
+ self._allowed_domains = allowed_domains
+
+ def blocked_domains(self):
+ """Return the sequence of blocked domains (as a tuple)."""
+ return self._blocked_domains
+ def set_blocked_domains(self, blocked_domains):
+ """Set the sequence of blocked domains."""
+ self._blocked_domains = tuple(blocked_domains)
+
+ def is_blocked(self, domain):
+ for blocked_domain in self._blocked_domains:
+ if user_domain_match(domain, blocked_domain):
+ return True
+ return False
+
+ def allowed_domains(self):
+ """Return None, or the sequence of allowed domains (as a tuple)."""
+ return self._allowed_domains
+ def set_allowed_domains(self, allowed_domains):
+ """Set the sequence of allowed domains, or None."""
+ if allowed_domains is not None:
+ allowed_domains = tuple(allowed_domains)
+ self._allowed_domains = allowed_domains
+
+ def is_not_allowed(self, domain):
+ if self._allowed_domains is None:
+ return False
+ for allowed_domain in self._allowed_domains:
+ if user_domain_match(domain, allowed_domain):
+ return False
+ return True
+
+ def set_ok(self, cookie, request):
+ """
+ If you override set_ok, be sure to call this method. If it returns
+ false, so should your subclass (assuming your subclass wants to be more
+ strict about which cookies to accept).
+
+ """
+ debug(" - checking cookie %s", cookie)
+
+ assert cookie.name is not None
+
+ for n in "version", "verifiability", "name", "path", "domain", "port":
+ fn_name = "set_ok_"+n
+ fn = getattr(self, fn_name)
+ if not fn(cookie, request):
+ return False
+
+ return True
+
+ def set_ok_version(self, cookie, request):
+ # Reject the cookie if it has no version, or if the protocol its
+ # version implies (RFC 2965 for version > 0, Netscape for version 0)
+ # is switched off on this policy. request is not consulted.
+ if cookie.version is None:
+ # Version is always set to 0 by parse_ns_headers if it's a Netscape
+ # cookie, so this must be an invalid RFC 2965 cookie.
+ debug(" Set-Cookie2 without version attribute (%s)", cookie)
+ return False
+ if cookie.version > 0 and not self.rfc2965:
+ debug(" RFC 2965 cookies are switched off")
+ return False
+ elif cookie.version == 0 and not self.netscape:
+ debug(" Netscape cookies are switched off")
+ return False
+ return True
+
+ def set_ok_verifiability(self, cookie, request):
+ # Only relevant for unverifiable, third-party requests: such cookies
+ # are refused when the strict_*_unverifiable switch matching the
+ # cookie's version is set. Everything else is accepted.
+ if request.unverifiable and is_third_party(request):
+ if cookie.version > 0 and self.strict_rfc2965_unverifiable:
+ debug(" third-party RFC 2965 cookie during "
+ "unverifiable transaction")
+ return False
+ elif cookie.version == 0 and self.strict_ns_unverifiable:
+ debug(" third-party Netscape cookie during "
+ "unverifiable transaction")
+ return False
+ return True
+
+ def set_ok_name(self, cookie, request):
+ # Refuse version-0 cookies whose name begins with "$" when
+ # strict_ns_set_initial_dollar is set; request is not consulted.
+ # Try and stop servers setting V0 cookies designed to hack other
+ # servers that know both V0 and V1 protocols.
+ if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
+ startswith(cookie.name, "$")):
+ debug(" illegal name (starts with '$'): '%s'", cookie.name)
+ return False
+ return True
+
+ def set_ok_path(self, cookie, request):
+ # An explicitly specified cookie path must be a string prefix of the
+ # request path. The check always applies to version > 0 cookies, and
+ # to version 0 cookies only when strict_ns_set_path is set. Cookies
+ # whose path was defaulted (path_specified false) are not checked.
+ if cookie.path_specified:
+ req_path = request_path(request)
+ if ((cookie.version > 0 or
+ (cookie.version == 0 and self.strict_ns_set_path)) and
+ not startswith(req_path, cookie.path)):
+ debug(" path attribute %s is not a prefix of request "
+ "path %s", cookie.path, req_path)
+ return False
+ return True
+
+ def set_ok_countrycode_domain(self, cookie, request):
+ """Return False if explicit cookie domain is not acceptable.
+
+ Called by set_ok_domain, for convenience of overriding by
+ subclasses.
+
+ """
+ # Only active when the domain was given explicitly and strict_domain
+ # is set; request is not consulted here.
+ if cookie.domain_specified and self.strict_domain:
+ domain = cookie.domain
+ # since domain was specified, we know that:
+ assert domain.startswith(".")
+ if string.count(domain, ".") == 2:
+ # domain like .foo.bar
+ i = string.rfind(domain, ".")
+ tld = domain[i+1:]
+ sld = domain[1:i]
+ # Reject a generic-looking second-level label under a two-letter
+ # top-level label (the check is purely lexical).
+ if (string.lower(sld) in [
+ "co", "ac",
+ "com", "edu", "org", "net", "gov", "mil", "int"] and
+ len(tld) == 2):
+ # domain like .co.uk
+ return False
+ return True
+
+ def set_ok_domain(self, cookie, request):
+ # Decide whether the cookie's domain is acceptable for this request.
+ # The user block-list / allow-list and the country-code check apply to
+ # every cookie; the remaining checks apply only when the domain was
+ # given explicitly (cookie.domain_specified).
+ if self.is_blocked(cookie.domain):
+ debug(" domain %s is in user block-list", cookie.domain)
+ return False
+ if self.is_not_allowed(cookie.domain):
+ debug(" domain %s is not in user allow-list", cookie.domain)
+ return False
+ if not self.set_ok_countrycode_domain(cookie, request):
+ debug(" country-code second level domain %s", cookie.domain)
+ return False
+ if cookie.domain_specified:
+ req_host, erhn = eff_request_host(request)
+ domain = cookie.domain
+ if startswith(domain, "."):
+ undotted_domain = domain[1:]
+ else:
+ undotted_domain = domain
+ # Apart from the special value ".local", the domain must contain a
+ # dot somewhere after its first character.
+ embedded_dots = (string.find(undotted_domain, ".") >= 0)
+ if not embedded_dots and domain != ".local":
+ debug(" non-local domain %s contains no embedded dot",
+ domain)
+ return False
+ if cookie.version == 0:
+ # Netscape rule: plain suffix test on the effective request-host,
+ # also tried with a leading dot added.
+ # NOTE(review): the debug message below contains a duplicated
+ # word ("end end"); it is a runtime string and is left as-is.
+ if (not endswith(erhn, domain) and
+ (not startswith(erhn, ".") and
+ not endswith("."+erhn, domain))):
+ debug(" effective request-host %s (even with added "
+ "initial dot) does not end end with %s",
+ erhn, domain)
+ return False
+ # strict_ns_domain is treated as a bit mask; the Domain* class
+ # attributes select which RFC 2965 rules also apply to version 0.
+ if (cookie.version > 0 or
+ (self.strict_ns_domain & self.DomainRFC2965Match)):
+ if not domain_match(erhn, domain):
+ debug(" effective request-host %s does not domain-match "
+ "%s", erhn, domain)
+ return False
+ if (cookie.version > 0 or
+ (self.strict_ns_domain & self.DomainStrictNoDots)):
+ # What is left of the request host after stripping len(domain)
+ # characters from its end must not contain a dot, unless the
+ # host matches IPV4_RE.
+ host_prefix = req_host[:-len(domain)]
+ if (string.find(host_prefix, ".") >= 0 and
+ not IPV4_RE.search(req_host)):
+ debug(" host prefix %s for domain %s contains a dot",
+ host_prefix, domain)
+ return False
+ return True
+
+ def set_ok_port(self, cookie, request):
+ # When the cookie carries an explicit port list, every entry examined
+ # must be numeric and the request port must appear in the
+ # comma-separated list. Cookies without an explicit port list are
+ # always accepted.
+ if cookie.port_specified:
+ req_port = request_port(request)
+ if req_port is None:
+ req_port = "80"
+ else:
+ # Compared as strings against the entries of cookie.port below.
+ req_port = str(req_port)
+ for p in string.split(cookie.port, ","):
+ try:
+ int(p)
+ except ValueError:
+ debug(" bad port %s (not numeric)", p)
+ return False
+ if p == req_port:
+ break
+ # NOTE(review): indentation is lost in this archive; this else
+ # presumably belongs to the for loop (runs when no port matched).
+ else:
+ debug(" request port (%s) not found in %s",
+ req_port, cookie.port)
+ return False
+ return True
+
+ def return_ok(self, cookie, request):
+ """
+ If you override return_ok, be sure to call this method. If it returns
+ false, so should your subclass (assuming your subclass wants to be more
+ strict about which cookies to return).
+
+ """
+ # Path has already been checked by path_return_ok, and domain blocking
+ # done by domain_return_ok.
+ debug(" - checking cookie %s", cookie)
+
+ # Run the individual return_ok_<name> checks in this fixed order; the
+ # first one that returns a false value vetoes the cookie.
+ for n in "version", "verifiability", "secure", "expires", "port", "domain":
+ fn_name = "return_ok_"+n
+ fn = getattr(self, fn_name)
+ if not fn(cookie, request):
+ return False
+ return True
+
+ def return_ok_version(self, cookie, request):
+ # Refuse to return a cookie whose protocol (RFC 2965 for version > 0,
+ # Netscape for version 0) is switched off on this policy.
+ if cookie.version > 0 and not self.rfc2965:
+ debug(" RFC 2965 cookies are switched off")
+ return False
+ elif cookie.version == 0 and not self.netscape:
+ debug(" Netscape cookies are switched off")
+ return False
+ return True
+
+ def return_ok_verifiability(self, cookie, request):
+ # Mirror of set_ok_verifiability for the return direction: on an
+ # unverifiable third-party request the cookie is withheld when the
+ # strict_*_unverifiable switch matching its version is set.
+ if request.unverifiable and is_third_party(request):
+ if cookie.version > 0 and self.strict_rfc2965_unverifiable:
+ debug(" third-party RFC 2965 cookie during unverifiable "
+ "transaction")
+ return False
+ elif cookie.version == 0 and self.strict_ns_unverifiable:
+ debug(" third-party Netscape cookie during unverifiable "
+ "transaction")
+ return False
+ return True
+
+ def return_ok_secure(self, cookie, request):
+ # A cookie flagged secure is only returned when request.get_type()
+ # is exactly "https".
+ if cookie.secure and request.get_type() != "https":
+ debug(" secure cookie with non-secure request")
+ return False
+ return True
+
+ def return_ok_expires(self, cookie, request):
+ # Withhold cookies that are expired relative to self._now. The jar
+ # assigns policy._now before it consults the policy (see
+ # CookieJar.add_cookie_header), so that attribute must already be set.
+ if cookie.is_expired(self._now):
+ debug(" cookie expired")
+ return False
+ return True
+
+ def return_ok_port(self, cookie, request):
+ # If the cookie is restricted to a port list, only return it when the
+ # request port equals one of the comma-separated entries. Cookies
+ # with a false cookie.port are returned on any port.
+ if cookie.port:
+ req_port = request_port(request)
+ if req_port is None:
+ req_port = "80"
+ # NOTE(review): unlike set_ok_port, req_port is not passed through
+ # str() here, so the comparison below only succeeds if request_port
+ # already returns a string -- confirm against request_port.
+ for p in string.split(cookie.port, ","):
+ if p == req_port:
+ break
+ # NOTE(review): indentation is lost in this archive; this else
+ # presumably belongs to the for loop (runs when no port matched).
+ else:
+ debug(" request port %s does not match cookie port %s",
+ req_port, cookie.port)
+ return False
+ return True
+
+ def return_ok_domain(self, cookie, request):
+ # Final domain check before a cookie is returned: version > 0 cookies
+ # must domain-match the effective request-host, version 0 cookies
+ # need only be a string suffix of "." + effective request-host.
+ req_host, erhn = eff_request_host(request)
+ domain = cookie.domain
+
+ # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
+ if (cookie.version == 0 and
+ (self.strict_ns_domain & self.DomainStrictNonDomain) and
+ not cookie.domain_specified and domain != erhn):
+ debug(" cookie with unspecified domain does not string-compare "
+ "equal to request domain")
+ return False
+
+ if cookie.version > 0 and not domain_match(erhn, domain):
+ debug(" effective request-host name %s does not domain-match "
+ "RFC 2965 cookie domain %s", erhn, domain)
+ return False
+ if cookie.version == 0 and not endswith("."+erhn, domain):
+ debug(" request-host %s does not match Netscape cookie domain "
+ "%s", req_host, domain)
+ return False
+ return True
+
+ def domain_return_ok(self, domain, request):
+ # Cheap pre-filter on a cookie *domain* (not an individual cookie):
+ # returns False when no cookie stored under this domain could be
+ # returned for the request, or when the user lists exclude the domain.
+ # Liberal check of domain. This is here as an optimization to avoid
+ # having to load lots of MSIE cookie files unless necessary.
+
+ # Munge req_host and erhn to always start with a dot, so as to err on
+ # the side of letting cookies through.
+ dotted_req_host, dotted_erhn = eff_request_host(request)
+ if not startswith(dotted_req_host, "."):
+ dotted_req_host = "."+dotted_req_host
+ if not startswith(dotted_erhn, "."):
+ dotted_erhn = "."+dotted_erhn
+ if not (endswith(dotted_req_host, domain) or
+ endswith(dotted_erhn, domain)):
+ #debug(" request domain %s does not match cookie domain %s",
+ # req_host, domain)
+ return False
+
+ if self.is_blocked(domain):
+ debug(" domain %s is in user block-list", domain)
+ return False
+ if self.is_not_allowed(domain):
+ debug(" domain %s is not in user allow-list", domain)
+ return False
+
+ return True
+
+ def path_return_ok(self, path, request):
+ # Pre-filter on a cookie *path*: cookies stored under path are only
+ # candidates when path is a string prefix of the request path.
+ debug("- checking cookie path=%s", path)
+ req_path = request_path(request)
+ if not startswith(req_path, path):
+ debug(" %s does not path-match %s", req_path, path)
+ return False
+ return True
+
+
+def vals_sorted_by_key(adict):
+ # Return the values of adict ordered by their (sorted) keys.
+ # Relies on keys() returning a list that can be sorted in place.
+ keys = adict.keys()
+ keys.sort()
+ return map(adict.get, keys)
+
+class MappingIterator:
+ """Iterates over nested mapping, depth-first, in sorted order by key."""
+ def __init__(self, mapping):
+ # Each stack entry is (values sorted by key, index of the next value
+ # to visit, the mapping those values came from -- None at the root).
+ self._s = [(vals_sorted_by_key(mapping), 0, None)] # LIFO stack
+
+ def __iter__(self): return self
+
+ def next(self):
+ # Return the next non-mapping value; raise StopIteration once the
+ # stack is exhausted.
+ # this is hairy because of lack of generators
+ while 1:
+ try:
+ vals, i, prev_item = self._s.pop()
+ except IndexError:
+ raise StopIteration()
+ if i < len(vals):
+ item = vals[i]
+ i = i + 1
+ # Push the current level back with its index advanced.
+ self._s.append((vals, i, prev_item))
+ # Anything with an "items" attribute is treated as a nested
+ # mapping and descended into; anything else is a leaf.
+ try:
+ item.items
+ except AttributeError:
+ # non-mapping
+ break
+ else:
+ # mapping
+ self._s.append((vals_sorted_by_key(item), 0, item))
+ continue
+ return item
+
+
+# Sentinel used as the second parameter to the dict.get method, to
+# distinguish an absent dict key from one with a None value. The class
+# object itself is the marker: callers compare with "is Absent".
+class Absent: pass
+
+class CookieJar:
+ """Collection of HTTP cookies.
+
+ You may not need to know about this class: try ClientCookie.urlopen().
+
+ The major methods are extract_cookies and add_cookie_header; these are all
+ you are likely to need.
+
+ CookieJar supports the iterator protocol:
+
+ for cookie in cookiejar:
+ # do something with cookie
+
+ Methods:
+
+ add_cookie_header(request)
+ extract_cookies(response, request)
+ make_cookies(response, request)
+ set_cookie_if_ok(cookie, request)
+ set_cookie(cookie)
+ clear_session_cookies()
+ clear_expired_cookies()
+ clear(domain=None, path=None, name=None)
+
+ Public attributes
+
+ policy: CookiePolicy object
+
+ """
+
+ non_word_re = re.compile(r"\W")
+ quote_re = re.compile(r"([\"\\])")
+ strict_domain_re = re.compile(r"\.?[^.]*")
+ domain_re = re.compile(r"[^.]*")
+ dots_re = re.compile(r"^\.+")
+
+ def __init__(self, policy=None):
+ """
+ See CookieJar.__doc__ for argument documentation.
+
+ """
+ # With no policy given, a fresh DefaultCookiePolicy is used.
+ if policy is None:
+ policy = DefaultCookiePolicy()
+ self._policy = policy
+
+ # Nested mapping: domain -> path -> name -> cookie (see set_cookie).
+ self._cookies = {}
+
+ # for __getitem__ iteration in pre-2.2 Pythons
+ self._prev_getitem_index = 0
+
+ def set_policy(self, policy):
+ # Replace the policy object consulted when setting and returning
+ # cookies; cookies already stored are left untouched.
+ self._policy = policy
+
+ def _cookies_for_domain(self, domain, request):
+ # Return the list of cookies stored under domain that the policy
+ # allows to be returned for request. The policy is asked in three
+ # stages: whole domain, then each path, then each individual cookie.
+ cookies = []
+ if not self._policy.domain_return_ok(domain, request):
+ return []
+ debug("Checking %s for cookies to return", domain)
+ cookies_by_path = self._cookies[domain]
+ for path in cookies_by_path.keys():
+ if not self._policy.path_return_ok(path, request):
+ continue
+ cookies_by_name = cookies_by_path[path]
+ for cookie in cookies_by_name.values():
+ if not self._policy.return_ok(cookie, request):
+ debug(" not returning cookie")
+ continue
+ debug(" it's a match")
+ cookies.append(cookie)
+ return cookies
+
+ def _cookies_for_request(self, request):
+ """Return a list of cookies to be returned to server."""
+ # Concatenates the per-domain results for every domain in the jar.
+ cookies = []
+ for domain in self._cookies.keys():
+ cookies.extend(self._cookies_for_domain(domain, request))
+ return cookies
+
+ def _cookie_attrs(self, cookies):
+ """Return a list of cookie-attributes to be returned to server.
+
+ like ['foo="bar"; $Path="/"', ...]
+
+ The $Version attribute is also added when appropriate (currently only
+ once per request).
+
+ """
+ # Note: the cookies list passed in is sorted in place.
+ # add cookies in order of most specific (ie. longest) path first
+ def decreasing_size(a, b): return cmp(len(b.path), len(a.path))
+ cookies.sort(decreasing_size)
+
+ version_set = False
+
+ attrs = []
+ for cookie in cookies:
+ # set version of Cookie header
+ # XXX
+ # What should it be if multiple matching Set-Cookie headers have
+ # different versions themselves?
+ # Answer: there is no answer; was supposed to be settled by
+ # RFC 2965 errata, but that may never appear...
+ version = cookie.version
+ # Only the first cookie (after sorting) decides whether a $Version
+ # attribute is emitted.
+ if not version_set:
+ version_set = True
+ if version > 0:
+ attrs.append("$Version=%s" % version)
+
+ # quote cookie value if necessary
+ # (not for Netscape protocol, which already has any quotes
+ # intact, due to the poorly-specified Netscape Cookie: syntax)
+ if ((cookie.value is not None) and
+ self.non_word_re.search(cookie.value) and version > 0):
+ value = self.quote_re.sub(r"\\\1", cookie.value)
+ else:
+ value = cookie.value
+
+ # add cookie-attributes to be returned in Cookie header
+ if cookie.value is None:
+ attrs.append(cookie.name)
+ else:
+ attrs.append("%s=%s" % (cookie.name, value))
+ # $Path, $Domain and $Port are only sent for version > 0 cookies.
+ if version > 0:
+ if cookie.path_specified:
+ attrs.append('$Path="%s"' % cookie.path)
+ if startswith(cookie.domain, "."):
+ domain = cookie.domain
+ # Drop the leading dot again if the server's original domain
+ # attribute did not have one.
+ if (not cookie.domain_initial_dot and
+ startswith(domain, ".")):
+ domain = domain[1:]
+ attrs.append('$Domain="%s"' % domain)
+ if cookie.port is not None:
+ p = "$Port"
+ if cookie.port_specified:
+ p = p + ('="%s"' % cookie.port)
+ attrs.append(p)
+
+ return attrs
+
+ def add_cookie_header(self, request):
+ """Add correct Cookie: header to request (urllib2.Request object).
+
+ The Cookie2 header is also added unless policy.hide_cookie2 is true.
+
+ The request object (usually a urllib2.Request instance) must support
+ the methods get_full_url, get_host, get_type, has_header, get_header,
+ header_items and add_unredirected_header, as documented by urllib2, and
+ the port attribute (the port number). Actually,
+ RequestUpgradeProcessor will automatically upgrade your Request object
+ to one with has_header, get_header, header_items and
+ add_unredirected_header, if it lacks those methods, for compatibility
+ with pre-2.4 versions of urllib2.
+
+ """
+ debug("add_cookie_header")
+ # Timestamp shared with the policy so both judge expiry identically.
+ self._policy._now = self._now = int(time.time())
+
+ # NOTE(review): req_host, erhn and strict_non_domain are computed here
+ # but not used anywhere else in this method.
+ req_host, erhn = eff_request_host(request)
+ strict_non_domain = (
+ self._policy.strict_ns_domain & self._policy.DomainStrictNonDomain)
+
+ cookies = self._cookies_for_request(request)
+
+ attrs = self._cookie_attrs(cookies)
+ if attrs:
+ # An existing Cookie header on the request is never overwritten.
+ if not request.has_header("Cookie"):
+ request.add_unredirected_header(
+ "Cookie", string.join(attrs, "; "))
+
+ # if necessary, advertise that we know RFC 2965
+ if self._policy.rfc2965 and not self._policy.hide_cookie2:
+ for cookie in cookies:
+ if cookie.version != 1 and not request.has_header("Cookie2"):
+ request.add_unredirected_header("Cookie2", '$Version="1"')
+ break
+
+ # Housekeeping: drop expired cookies from the jar after every call.
+ self.clear_expired_cookies()
+
+ def _normalized_cookie_tuples(self, attrs_set):
+ """Return list of tuples containing normalised cookie information.
+
+ attrs_set is the list of lists of key,value pairs extracted from
+ the Set-Cookie or Set-Cookie2 headers.
+
+ Tuples are name, value, standard, rest, where name and value are the
+ cookie name and value, standard is a dictionary containing the standard
+ cookie-attributes (discard, secure, version, expires or max-age,
+ domain, path and port) and rest is a dictionary containing the rest of
+ the cookie-attributes.
+
+ """
+ # Cookies found to be malformed (bad_cookie) are dropped from the
+ # result. Uses self._now, so callers must have set it beforehand.
+ cookie_tuples = []
+
+ boolean_attrs = "discard", "secure"
+ value_attrs = ("version",
+ "expires", "max-age",
+ "domain", "path", "port",
+ "comment", "commenturl")
+
+ for cookie_attrs in attrs_set:
+ # The first pair of each list is the cookie's own name and value.
+ name, value = cookie_attrs[0]
+
+ # Build dictionary of standard cookie-attributes (standard) and
+ # dictionary of other cookie-attributes (rest).
+
+ # Note: expiry time is normalised to seconds since epoch. V0
+ # cookies should have the Expires cookie-attribute, and V1 cookies
+ # should have Max-Age, but since V1 includes RFC 2109 cookies (and
+ # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
+ # accept either (but prefer Max-Age).
+ max_age_set = False
+
+ bad_cookie = False
+
+ standard = {}
+ rest = {}
+ for k, v in cookie_attrs[1:]:
+ lc = string.lower(k)
+ # don't lose case distinction for unknown fields
+ if lc in value_attrs or lc in boolean_attrs:
+ k = lc
+ if k in boolean_attrs and v is None:
+ # boolean cookie-attribute is present, but has no value
+ # (like "discard", rather than "port=80")
+ v = True
+ if standard.has_key(k):
+ # only first value is significant
+ continue
+ if k == "domain":
+ if v is None:
+ debug(" missing value for domain attribute")
+ bad_cookie = True
+ break
+ # RFC 2965 section 3.3.3
+ v = string.lower(v)
+ if k == "expires":
+ if max_age_set:
+ # Prefer max-age to expires (like Mozilla)
+ continue
+ if v is None:
+ debug(" missing or invalid value for expires "
+ "attribute: treating as session cookie")
+ continue
+ if k == "max-age":
+ max_age_set = True
+ try:
+ v = int(v)
+ except ValueError:
+ debug(" missing or invalid (non-numeric) value for "
+ "max-age attribute")
+ bad_cookie = True
+ break
+ # convert RFC 2965 Max-Age to seconds since epoch
+ # XXX Strictly you're supposed to follow RFC 2616
+ # age-calculation rules. Remember that zero Max-Age is a
+ # request to discard (old and new) cookie, though.
+ k = "expires"
+ v = self._now + v
+ if (k in value_attrs) or (k in boolean_attrs):
+ # Only port, comment and commenturl may legitimately appear
+ # without a value.
+ if (v is None and
+ k not in ["port", "comment", "commenturl"]):
+ debug(" missing value for %s attribute" % k)
+ bad_cookie = True
+ break
+ standard[k] = v
+ else:
+ rest[k] = v
+
+ if bad_cookie:
+ continue
+
+ cookie_tuples.append((name, value, standard, rest))
+
+ return cookie_tuples
+
+ def _cookie_from_cookie_tuple(self, tup, request):
+ # Build a Cookie from one (name, value, standard, rest) tuple, filling
+ # in defaults for path, domain, port and expiry from the request.
+ # Returns None (after trying to clear any stored cookie with the same
+ # domain, path and name) when the expiry time is not in the future.
+ # standard is dict of standard cookie-attributes, rest is dict of the
+ # rest of them
+ name, value, standard, rest = tup
+
+ # Absent distinguishes "attribute not present" from a None value.
+ domain = standard.get("domain", Absent)
+ path = standard.get("path", Absent)
+ port = standard.get("port", Absent)
+ expires = standard.get("expires", Absent)
+
+ # set the easy defaults
+ version = standard.get("version", None)
+ if version is not None: version = int(version)
+ secure = standard.get("secure", False)
+ # (discard is also set if expires is Absent)
+ discard = standard.get("discard", False)
+ comment = standard.get("comment", None)
+ comment_url = standard.get("commenturl", None)
+
+ # set default path
+ if path is not Absent and path != "":
+ path_specified = True
+ path = escape_path(path)
+ else:
+ # Default: the request path cut at its last "/".
+ path_specified = False
+ path = request_path(request)
+ i = string.rfind(path, "/")
+ if i != -1:
+ if version == 0:
+ # Netscape spec parts company from reality here
+ path = path[:i]
+ else:
+ path = path[:i+1]
+ if len(path) == 0: path = "/"
+
+ # set default domain
+ domain_specified = domain is not Absent
+ # but first we have to remember whether it starts with a dot
+ domain_initial_dot = False
+ if domain_specified:
+ domain_initial_dot = bool(startswith(domain, "."))
+ if domain is Absent:
+ req_host, erhn = eff_request_host(request)
+ domain = erhn
+ elif not startswith(domain, "."):
+ # Explicit domains are always stored with a leading dot.
+ domain = "."+domain
+
+ # set default port
+ port_specified = False
+ if port is not Absent:
+ if port is None:
+ # Port attr present, but has no value: default to request port.
+ # Cookie should then only be sent back on that port.
+ port = request_port(request)
+ else:
+ port_specified = True
+ port = re.sub(r"\s+", "", port)
+ else:
+ # No port attr present. Cookie can be sent back on any port.
+ port = None
+
+ # set default expires and discard
+ if expires is Absent:
+ expires = None
+ discard = True
+ elif expires <= self._now:
+ # Expiry date in past is request to delete cookie. This can't be
+ # in DefaultCookiePolicy, because can't delete cookies there.
+ try:
+ self.clear(domain, path, name)
+ except KeyError:
+ pass
+ debug("Expiring cookie, domain='%s', path='%s', name='%s'",
+ domain, path, name)
+ return None
+
+ return Cookie(version,
+ name, value,
+ port, port_specified,
+ domain, domain_specified, domain_initial_dot,
+ path, path_specified,
+ secure,
+ expires,
+ discard,
+ comment,
+ comment_url,
+ rest)
+
+ def _cookies_from_attrs_set(self, attrs_set, request):
+ # Turn parsed header attribute lists into Cookie objects, skipping
+ # entries for which _cookie_from_cookie_tuple returns a false value
+ # (e.g. None for an already-expired cookie).
+ cookie_tuples = self._normalized_cookie_tuples(attrs_set)
+
+ cookies = []
+ for tup in cookie_tuples:
+ cookie = self._cookie_from_cookie_tuple(tup, request)
+ if cookie: cookies.append(cookie)
+ return cookies
+
+ def _process_rfc2109_cookies(self, cookies):
+ # Mark every version-1 cookie in cookies as RFC 2109 and, if so
+ # configured, downgrade it to version 0. Mutates the cookies in place.
+ # When policy.rfc2109_as_netscape is None the downgrade happens
+ # exactly when RFC 2965 handling is switched off.
+ if self._policy.rfc2109_as_netscape is None:
+ rfc2109_as_netscape = not self._policy.rfc2965
+ else:
+ rfc2109_as_netscape = self._policy.rfc2109_as_netscape
+ for cookie in cookies:
+ if cookie.version == 1:
+ cookie.rfc2109 = True
+ if rfc2109_as_netscape:
+ # treat 2109 cookies as Netscape cookies rather than
+ # as RFC2965 cookies
+ cookie.version = 0
+
+ def make_cookies(self, response, request):
+ """Return sequence of Cookie objects extracted from response object.
+
+ See extract_cookies.__doc__ for the interfaces required of the
+ response and request arguments.
+
+ """
+ # get cookie-attributes for RFC 2965 and Netscape protocols
+ headers = response.info()
+ rfc2965_hdrs = getheaders(headers, "Set-Cookie2")
+ ns_hdrs = getheaders(headers, "Set-Cookie")
+
+ rfc2965 = self._policy.rfc2965
+ netscape = self._policy.netscape
+
+ if ((not rfc2965_hdrs and not ns_hdrs) or
+ (not ns_hdrs and not rfc2965) or
+ (not rfc2965_hdrs and not netscape) or
+ (not netscape and not rfc2965)):
+ return [] # no relevant cookie headers: quick exit
+
+ # Parsing errors are swallowed after reraise_unmasked_exceptions()
+ # has had the chance to re-raise -- presumably it lets through
+ # exceptions that must not be masked; it is defined outside this
+ # excerpt, TODO confirm.
+ try:
+ cookies = self._cookies_from_attrs_set(
+ split_header_words(rfc2965_hdrs), request)
+ except:
+ reraise_unmasked_exceptions()
+ cookies = []
+
+ if ns_hdrs and netscape:
+ try:
+ # RFC 2109 and Netscape cookies
+ ns_cookies = self._cookies_from_attrs_set(
+ parse_ns_headers(ns_hdrs), request)
+ except:
+ reraise_unmasked_exceptions()
+ ns_cookies = []
+ self._process_rfc2109_cookies(ns_cookies)
+
+ # Look for Netscape cookies (from Set-Cookie headers) that match
+ # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
+ # For each match, keep the RFC 2965 cookie and ignore the Netscape
+ # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are
+ # bundled in with the Netscape cookies for this purpose, which is
+ # reasonable behaviour.
+ if rfc2965:
+ # lookup is used as a set keyed by (domain, path, name).
+ lookup = {}
+ for cookie in cookies:
+ lookup[(cookie.domain, cookie.path, cookie.name)] = None
+
+ def no_matching_rfc2965(ns_cookie, lookup=lookup):
+ key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
+ return not lookup.has_key(key)
+ ns_cookies = filter(no_matching_rfc2965, ns_cookies)
+
+ if ns_cookies:
+ cookies.extend(ns_cookies)
+
+ return cookies
+
+ def set_cookie_if_ok(self, cookie, request):
+ """Set a cookie if policy says it's OK to do so.
+
+ cookie: ClientCookie.Cookie instance
+ request: see extract_cookies.__doc__ for the required interface
+
+ """
+ # Refresh the timestamp shared by the jar and its policy first.
+ self._policy._now = self._now = int(time.time())
+
+ if self._policy.set_ok(cookie, request):
+ self.set_cookie(cookie)
+
+ def set_cookie(self, cookie):
+ """Set a cookie, without checking whether or not it should be set.
+
+ cookie: ClientCookie.Cookie instance
+ """
+ # Store under self._cookies[domain][path][name], creating the
+ # intermediate dictionaries on demand. An existing cookie with the
+ # same domain, path and name is replaced.
+ c = self._cookies
+ if not c.has_key(cookie.domain): c[cookie.domain] = {}
+ c2 = c[cookie.domain]
+ if not c2.has_key(cookie.path): c2[cookie.path] = {}
+ c3 = c2[cookie.path]
+ c3[cookie.name] = cookie
+
+ def extract_cookies(self, response, request):
+ """Extract cookies from response, where allowable given the request.
+
+ Look for allowable Set-Cookie: and Set-Cookie2: headers in the response
+ object passed as argument. Any of these headers that are found are
+ used to update the state of the object (subject to the policy.set_ok
+ method's approval).
+
+ The response object (usually the result of a call to
+ ClientCookie.urlopen, or similar) should support an info method, which
+ returns a mimetools.Message object (in fact, the 'mimetools.Message
+ object' may be any object that provides a getallmatchingheaders
+ method).
+
+ The request object (usually a urllib2.Request instance) must support
+ the methods get_full_url and get_host, as documented by urllib2, and
+ the port attribute (the port number). The request is used to set
+ default values for cookie-attributes as well as for checking that the
+ cookie is OK to be set.
+
+ """
+ debug("extract_cookies: %s", response.info())
+ # Refresh the timestamp shared by the jar and its policy first.
+ self._policy._now = self._now = int(time.time())
+
+ for cookie in self.make_cookies(response, request):
+ if self._policy.set_ok(cookie, request):
+ debug(" setting cookie: %s", cookie)
+ self.set_cookie(cookie)
+
+ def clear(self, domain=None, path=None, name=None):
+ """Clear some cookies.
+
+ Invoking this method without arguments will clear all cookies. If
+ given a single argument, only cookies belonging to that domain will be
+ removed. If given two arguments, cookies belonging to the specified
+ path within that domain are removed. If given three arguments, then
+ the cookie with the specified name, path and domain is removed.
+
+ Raises KeyError if no matching cookie exists.
+
+ Raises ValueError if name is given without both domain and path, or
+ path is given without domain.
+
+ """
+ # The most specific argument supplied decides how deep the deletion
+ # goes; emptied intermediate dictionaries are left in place.
+ if name is not None:
+ if (domain is None) or (path is None):
+ raise ValueError(
+ "domain and path must be given to remove a cookie by name")
+ del self._cookies[domain][path][name]
+ elif path is not None:
+ if domain is None:
+ raise ValueError(
+ "domain must be given to remove cookies by path")
+ del self._cookies[domain][path]
+ elif domain is not None:
+ del self._cookies[domain]
+ else:
+ self._cookies = {}
+
+ def clear_session_cookies(self):
+ """Discard all session cookies.
+
+ Discards all cookies held by object which had either no Max-Age or
+ Expires cookie-attribute or an explicit Discard cookie-attribute, or
+ which otherwise have ended up with a true discard attribute. For
+ interactive browsers, the end of a session usually corresponds to
+ closing the browser window.
+
+ Note that the save method won't save session cookies anyway, unless you
+ ask otherwise by passing a true ignore_discard argument.
+
+ """
+ # Iteration goes through MappingIterator, which walks lists of values
+ # built by vals_sorted_by_key rather than the dictionaries themselves.
+ for cookie in self:
+ if cookie.discard:
+ self.clear(cookie.domain, cookie.path, cookie.name)
+
+ def clear_expired_cookies(self):
+ """Discard all expired cookies.
+
+ You probably don't need to call this method: expired cookies are never
+ sent back to the server (provided you're using DefaultCookiePolicy),
+ this method is called by CookieJar itself every so often, and the save
+ method won't save expired cookies anyway (unless you ask otherwise by
+ passing a true ignore_expires argument).
+
+ """
+ # Uses a fresh (float) time.time() reading rather than the integer
+ # self._now cached by add_cookie_header / extract_cookies.
+ now = time.time()
+ for cookie in self:
+ if cookie.is_expired(now):
+ self.clear(cookie.domain, cookie.path, cookie.name)
+
+ def __getitem__(self, i):
+ # Sequence-protocol emulation of iteration (see the "pre-2.2 Pythons"
+ # note in __init__). Index 0 starts a new iterator; every later call
+ # must use the index following the previous one. This is not random
+ # access: any other index raises IndexError.
+ if i == 0:
+ self._getitem_iterator = self.__iter__()
+ elif self._prev_getitem_index != i-1: raise IndexError(
+ "CookieJar.__getitem__ only supports sequential iteration")
+ self._prev_getitem_index = i
+ try:
+ return self._getitem_iterator.next()
+ except StopIteration:
+ # End of iteration is reported as IndexError, as the sequence
+ # protocol expects.
+ raise IndexError()
+
+ def __iter__(self):
+ # Yields the stored Cookie objects by walking the nested
+ # domain/path/name mapping in sorted key order at each level.
+ return MappingIterator(self._cookies)
+
+ def __len__(self):
+ """Return number of contained cookies."""
+ # Counted by a full iteration over the jar.
+ i = 0
+ for cookie in self: i = i + 1
+ return i
+
+ def __repr__(self):
+ # "<class[cookie, cookie, ...]>" built from repr() of each cookie.
+ r = []
+ for cookie in self: r.append(repr(cookie))
+ return "<%s[%s]>" % (self.__class__, string.join(r, ", "))
+
+ def __str__(self):
+ # Same layout as __repr__, but built from str() of each cookie.
+ r = []
+ for cookie in self: r.append(str(cookie))
+ return "<%s[%s]>" % (self.__class__, string.join(r, ", "))
+
+
+# Raised when a cookie file is not in the format the loading class
+# understands (see FileCookieJar.load).
+class LoadError(Exception): pass
+
+class FileCookieJar(CookieJar):
+ """CookieJar that can be loaded from and saved to a file.
+
+ Additional methods
+
+ save(filename=None, ignore_discard=False, ignore_expires=False)
+ load(filename=None, ignore_discard=False, ignore_expires=False)
+ revert(filename=None, ignore_discard=False, ignore_expires=False)
+
+ Additional public attributes
+
+ filename: filename for loading and saving cookies
+
+ Additional public readable attributes
+
+ delayload: request that cookies are lazily loaded from disk; this is only
+ a hint since this only affects performance, not behaviour (unless the
+ cookies on disk are changing); a CookieJar object may ignore it (in fact,
+ only MSIECookieJar lazily loads cookies at the moment)
+
+ """
+
+ def __init__(self, filename=None, delayload=False, policy=None):
+ """
+ See FileCookieJar.__doc__ for argument documentation.
+
+ Cookies are NOT loaded from the named file until either the load or
+ revert method is called.
+
+ Raises ValueError if filename is neither None nor string-like.
+
+ """
+ CookieJar.__init__(self, policy)
+ if filename is not None and not isstringlike(filename):
+ raise ValueError("filename must be string-like")
+ self.filename = filename
+ # Normalised to a real boolean.
+ self.delayload = bool(delayload)
+
+ def save(self, filename=None, ignore_discard=False, ignore_expires=False):
+ """Save cookies to a file.
+
+ filename: name of file in which to save cookies
+ ignore_discard: save even cookies set to be discarded
+ ignore_expires: save even cookies that have expired
+
+ The file is overwritten if it already exists, thus wiping all its
+ cookies. Saved cookies can be restored later using the load or revert
+ methods. If filename is not specified, self.filename is used; if
+ self.filename is None, ValueError is raised.
+
+ """
+ # This base implementation only documents the contract: it always
+ # raises NotImplementedError.
+ raise NotImplementedError()
+
+ def load(self, filename=None, ignore_discard=False, ignore_expires=False):
+ """Load cookies from a file.
+
+ Old cookies are kept unless overwritten by newly loaded ones.
+
+ Arguments are as for .save().
+
+ If filename is not specified, self.filename is used; if self.filename
+ is None, ValueError is raised. The named file must be in the format
+ understood by the class, or LoadError will be raised. This format will
+ be identical to that written by the save method, unless the load format
+ is not sufficiently well understood (as is the case for MSIECookieJar).
+
+ """
+ if filename is None:
+ if self.filename is not None: filename = self.filename
+ else: raise ValueError(MISSING_FILENAME_TEXT)
+
+ # The actual parsing is delegated to self._really_load, which is not
+ # defined in the part of this class visible here (presumably supplied
+ # by subclasses -- TODO confirm). The file is closed even if it raises.
+ f = open(filename)
+ try:
+ self._really_load(f, filename, ignore_discard, ignore_expires)
+ finally:
+ f.close()
+
+ def revert(self, filename=None,
+ ignore_discard=False, ignore_expires=False):
+ """Clear all cookies and reload cookies from a saved file.
+
+ Raises LoadError (or IOError) if reversion is not successful; the
+ object's state will not be altered if this happens.
+
+ """
+ if filename is None:
+ if self.filename is not None: filename = self.filename
+ else: raise ValueError(MISSING_FILENAME_TEXT)
+
+ # Snapshot the current cookies so they can be put back on failure.
+ old_state = copy.deepcopy(self._cookies)
+ self._cookies = {}
+ try:
+ self.load(filename, ignore_discard, ignore_expires)
+ # NOTE(review): only LoadError and IOError restore the snapshot; any
+ # other exception raised by load() leaves the jar in whatever state
+ # the partial load reached.
+ except (LoadError, IOError):
+ self._cookies = old_state
+ raise
Added: Zope3/trunk/src/ClientCookie/_ConnCache.py
===================================================================
--- Zope3/trunk/src/ClientCookie/_ConnCache.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/ClientCookie/_ConnCache.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,239 @@
+"""Generic connection cache manager.
+
+Example:
+
+ from ClientCookie import ConnectionCache
+ cache = ConnectionCache()
+ cache.deposit("http", "example.com", conn)
+ conn = cache.withdraw("http", "example.com")
+
+
+The ConnectionCache class provides cache expiration.
+
+
+Copyright (C) 2004 John J Lee <jjl at pobox.com>.
+Copyright (C) 2001 Gisle Aas.
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD License (see the file COPYING included with the
+distribution).
+
+"""
+
+# Ported from libwww-perl 5.75.
+
+import time
+try:
+ from types import StringTypes
+except ImportError:
+ from types import StringType
+ StringTypes = StringType
+
+from _Util import compat_isinstance
+from _Debug import getLogger
+debug = getLogger("ClientCookie").debug
+
+class _ConnectionRecord:
+ def __init__(self, conn, scheme, key, time):
+ self.conn, self.scheme, self.key, self.time = conn, scheme, key, time
+ def __repr__(self):
+ return "%s(%s, %s, %s, %s)" % (
+ self.__class__.__name__,
+ self.conn, self.scheme, self.key, self.time)
+
+class ConnectionCache:
+ """
+ For specialized cache policy it makes sense to subclass ConnectionCache and
+ perhaps override the .deposit(), ._enforce_limits() and ._dropping()
+ methods.
+
+ """
+ def __init__(self, total_capacity=1):
+ self._limit = {}
+ self.total_capacity(total_capacity)
+
+ def set_total_capacity(self, nr_connections):
+ """Set limit for number of cached connections.
+
+ Connections will start to be dropped when this limit is reached. If 0,
+ all connections are immediately dropped. None means no limit.
+
+ """
+ self._limit_total = nr_connections
+ self._enforce_limits()
+
+ def total_capacity(self):
+ """Return limit for number of cached connections."""
+ return self._limit_total
+
+ def set_capacity(self, scheme, nr_connections):
+ """Set limit for number of cached connections of specifed scheme.
+
+ scheme: URL scheme (eg. "http" or "ftp")
+
+ """
+ self._limit[scheme] = nr_connections
+ self._enforce_limits(scheme)
+
+ def capacity(self, scheme):
+ """Return limit for number of cached connections of specifed scheme.
+
+ scheme: URL scheme (eg. "http" or "ftp")
+
+ """
+ return self._limit[scheme]
+
+ def drop(self, checker=None, reason=None):
+ """Drop connections by some criteria.
+
+ checker: either a callable, a number, a string, or None:
+ If callable: called for each connection with arguments (conn, scheme,
+ key, deposit_time); if it returns a true value, the connection is
+ dropped (default is to drop all connections).
+ If a number: all connections untouched for the given number of seconds
+ or more are dropped.
+ If a string: all connections of the given scheme are dropped.
+ If None: all connections are dropped.
+ reason: passed on to the dropped() method
+
+ """
+ if not callable(checker):
+ if checker is None:
+ checker = lambda cr: True # drop all of them
+ elif compat_isinstance(checker, StringTypes):
+ scheme = checker
+ if reason is None:
+ reason = "drop %s" % scheme
+ checker = lambda cr, scheme=scheme: cr.scheme == scheme
+ else: # numeric
+ age_limit = checker
+ time_limit = time.time() - age_limit
+ if reason is None:
+ reason = "older than %s" % age_limit
+ checker = lambda cr, time_limit=time_limit: cr.time < time_limit
+ if reason is None:
+ reason = "drop"
+
+## local $SIG{__DIE__}; # don't interfere with eval below
+## local $@;
+ crs = []
+ for cr in self._conns:
+ if checker(cr):
+ self._dropping(cr, reason)
+ drop = drop + 1
+ if not drop:
+ crs.append(cr)
+ self._conns = crs
+
+ def prune(self):
+ """Drop all dead connections.
+
+ This is tested by calling the .ping() method on the connections. If
+ the .ping() method exists and returns a false value, then the
+ connection is dropped.
+
+ """
+ # XXX HTTPConnection doesn't have a .ping() method
+ #self.drop(lambda cr: not cr.conn.ping(), "ping")
+ pass
+
+ def get_schemes(self):
+ """Return list of cached connection URL schemes."""
+ t = {}
+ for cr in self._conns:
+ t[cr.scheme] = None
+ return t.keys()
+
+ def get_connections(self, scheme=None):
+ """Return list of all connection objects with the specified URL scheme.
+
+ If no scheme is specified then all connections are returned.
+
+ """
+ cs = []
+ for cr in self._conns:
+ if scheme is None or (scheme and scheme == cr.scheme):
+ c.append(cr.conn)
+ return cs
+
+# -------------------------------------------------------------------------
+# Methods called by handlers to try to save away connections and get them
+# back again.
+
+ def deposit(self, scheme, key, conn):
+ """Add a new connection to the cache.
+
+ scheme: URL scheme (eg. "http")
+ key: any object that can act as a dict key (usually a string or a
+ tuple)
+
+ As a side effect, other already cached connections may be dropped.
+ Multiple connections with the same scheme/key might be added.
+
+ """
+ self._conns.append(_ConnectionRecord(conn, scheme, key, time.time()))
+ self._enforce_limits(scheme)
+
+ def withdraw(self, scheme, key):
+ """Try to fetch back a connection that was previously deposited.
+
+ If no cached connection with the specified scheme/key is found, then
+ None is returned. There is no guarantee that a deposited connection
+ can be withdrawn, as the cache manger is free to drop connections at
+ any time.
+
+ """
+ conns = self._conns
+ for i in range(len(conns)):
+ cr = conns[i]
+ if not (cr.scheme == scheme and cr.key == key):
+ continue
+ conns.pop(i) # remove it
+ return cr.conn
+ return None
+
+# -------------------------------------------------------------------------
+# Called internally. Subclasses might want to override these.
+
+ def _enforce_limits(self, scheme=None):
+ """Drop some cached connections, if necessary.
+
+ Called after a new connection is added (deposited) in the cache or
+ capacity limits are adjusted.
+
+ The default implementation drops connections until the specified
+ capacity limits are not exceeded.
+
+ """
+ conns = self._conns
+ if scheme:
+ schemes = [scheme]
+ else:
+ schemes = self.get_schemes()
+ for scheme in schemes:
+ limit = self._limit.get(scheme)
+ if limit is None:
+ continue
+ for i in range(len(conns), 0, -1):
+ if conns[i].scheme != scheme:
+ continue
+ limit = limit - 1
+ if limit < 0:
+ self._dropping(
+ conns.pop(i),
+ "connection cache %s capacity exceeded" % scheme)
+
+ total = self._limit_total
+ if total is not None:
+ while len(conns) > total:
+ self._dropping(conns.pop(0),
+ "connection cache total capacity exceeded")
+
+ def _dropping(self, conn_record, reason):
+ """Called when a connection is dropped.
+
+ conn_record: _ConnectionRecord instance for the dropped connection
+ reason: string describing the reason for the drop
+
+ """
+ debug("DROPPING %s [%s]" % (conn_record, reason))
Added: Zope3/trunk/src/ClientCookie/_Debug.py
===================================================================
--- Zope3/trunk/src/ClientCookie/_Debug.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/ClientCookie/_Debug.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,49 @@
+import sys
+
+import ClientCookie
+
+# Define warn(text).  When the warnings module cannot be imported, fall back
+# to writing the message to ClientCookie.WARNINGS_STREAM.
+try:
+ import warnings
+except ImportError:
+ def warn(text):
+ ClientCookie.WARNINGS_STREAM.write("WARNING: "+text)
+else:
+ def warn(text):
+ # stacklevel=2 attributes the warning to the caller of warn()
+ warnings.warn(text, stacklevel=2)
+
+try:
+ import logging
+except:
+ NOTSET = None
+ INFO = 20
+ DEBUG = 10
+ class NullHandler:
+ def write(self, data): pass
+ class Logger:
+ def __init__(self):
+ self.level = NOTSET
+ self.handler = NullHandler()
+ def log(self, level, text, *args):
+ if args:
+ text = text % args
+ if self.level is not None and level <= self.level:
+ self.handler.write(text+"\n")
+ def debug(self, text, *args):
+ apply(self.log, (DEBUG, text)+args)
+ def info(self, text, *args):
+ apply(self.log, (INFO, text)+args)
+ def setLevel(self, lvl):
+ self.level = lvl
+ def addHandler(self, handler):
+ self.handler = handler
+ LOGGER = Logger()
+ def getLogger(name): return LOGGER
+ class StreamHandler:
+ def __init__(self, strm=None):
+ if not strm:
+ strm = sys.stderr
+ self.stream = strm
+ def write(self, data):
+ self.stream.write(data)
+else:
+ from logging import getLogger, StreamHandler, INFO, DEBUG, NOTSET
Added: Zope3/trunk/src/ClientCookie/_HeadersUtil.py
===================================================================
--- Zope3/trunk/src/ClientCookie/_HeadersUtil.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/ClientCookie/_HeadersUtil.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,227 @@
+"""Utility functions for HTTP header value parsing and construction.
+
+Copyright 1997-1998, Gisle Aas
+Copyright 2002-2004, John J. Lee
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD License (see the file COPYING included with the
+distribution).
+
+"""
+
+import re, string
+from types import StringType
+try:
+ from types import UnicodeType
+ STRING_TYPES = StringType, UnicodeType
+except:
+ STRING_TYPES = StringType,
+
+from _Util import startswith, endswith, http2time
+
+try: True
+except NameError:
+ True = 1
+ False = 0
+
+def is_html(ct_headers, url):
+ """
+ ct_headers: Sequence of Content-Type headers
+ url: Response URL
+
+ """
+ if not ct_headers:
+ # guess
+ return (url.endswith('.htm') or url.endswith('.html') or
+ url.endswith('.xhtml'))
+ # use first header
+ ct = split_header_words(ct_headers)[0][0][0]
+ return ct in [
+ "text/html", "text/xhtml", "text/xml",
+ "application/xml", "application/xhtml+xml",
+ ]
+
+def unmatched(match):
+ """Return unmatched part of re.Match object."""
+ start, end = match.span(0)
+ return match.string[:start]+match.string[end:]
+
+# Patterns used by split_header_words().  Each one is anchored at the start
+# of the text still to be parsed; the matched part is then cut out with
+# unmatched().
+token_re = re.compile(r"^\s*([^=\s;,]+)")
+quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
+value_re = re.compile(r"^\s*=\s*([^\s;,]*)")
+escape_re = re.compile(r"\\(.)")
+def split_header_words(header_values):
+ r"""Parse header values into a list of lists containing key,value pairs.
+
+ The function knows how to deal with ",", ";" and "=" as well as quoted
+ values after "=". A list of space separated tokens are parsed as if they
+ were separated by ";".
+
+ If the header_values passed as argument contains multiple values, then they
+ are treated as if they were a single value separated by comma ",".
+
+ This means that this function is useful for parsing header fields that
+ follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
+ the requirement for tokens).
+
+ headers = #header
+ header = (token | parameter) *( [";"] (token | parameter))
+
+ token = 1*<any CHAR except CTLs or separators>
+ separators = "(" | ")" | "<" | ">" | "@"
+ | "," | ";" | ":" | "\" | <">
+ | "/" | "[" | "]" | "?" | "="
+ | "{" | "}" | SP | HT
+
+ quoted-string = ( <"> *(qdtext | quoted-pair ) <"> )
+ qdtext = <any TEXT except <">>
+ quoted-pair = "\" CHAR
+
+ parameter = attribute "=" value
+ attribute = token
+ value = token | quoted-string
+
+ Each header is represented by a list of key/value pairs. The value for a
+ simple token (not part of a parameter) is None. Syntactically incorrect
+ headers will not necessarily be parsed as you would want.
+
+ This is easier to describe with some examples:
+
+ >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
+ [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
+ >>> split_header_words(['text/html; charset="iso-8859-1"'])
+ [[('text/html', None), ('charset', 'iso-8859-1')]]
+ >>> split_header_words([r'Basic realm="\"foo\bar\""'])
+ [[('Basic', None), ('realm', '"foobar"')]]
+
+ """
+ # a single string would be iterated character by character below
+ assert type(header_values) not in STRING_TYPES
+ result = []
+ for text in header_values:
+ orig_text = text
+ pairs = []
+ # each pass removes one token (with its value, if any), one comma, or
+ # some junk from the front of text
+ while text:
+ m = token_re.search(text)
+ if m:
+ text = unmatched(m)
+ name = m.group(1)
+ m = quoted_value_re.search(text)
+ if m: # quoted value
+ text = unmatched(m)
+ value = m.group(1)
+ # undo backslash escapes inside the quoted string
+ value = escape_re.sub(r"\1", value)
+ else:
+ m = value_re.search(text)
+ if m: # unquoted value
+ text = unmatched(m)
+ value = m.group(1)
+ value = string.rstrip(value)
+ else:
+ # no value, a lone token
+ value = None
+ pairs.append((name, value))
+ elif startswith(string.lstrip(text), ","):
+ # concatenated headers, as per RFC 2616 section 4.2
+ text = string.lstrip(text)[1:]
+ if pairs: result.append(pairs)
+ pairs = []
+ else:
+ # skip junk
+ non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
+ # guards against looping forever when nothing could be removed
+ assert nr_junk_chars > 0, (
+ "split_header_words bug: '%s', '%s', %s" %
+ (orig_text, text, pairs))
+ text = non_junk
+ if pairs: result.append(pairs)
+ return result
+
+join_escape_re = re.compile(r"([\"\\])")
+def join_header_words(lists):
+ """Do the inverse of the conversion done by split_header_words.
+
+ Takes a list of lists of (key, value) pairs and produces a single header
+ value. Attribute values are quoted if needed.
+
+ >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
+ 'text/plain; charset="iso-8859/1"'
+ >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
+ 'text/plain, charset="iso-8859/1"'
+
+ """
+ headers = []
+ for pairs in lists:
+ attr = []
+ for k, v in pairs:
+ if v is not None:
+ if not re.search(r"^\w+$", v):
+ v = join_escape_re.sub(r"\\\1", v) # escape " and \
+ v = '"%s"' % v
+ if k is None: # Netscape cookies may have no name
+ k = v
+ else:
+ k = "%s=%s" % (k, v)
+ attr.append(k)
+ if attr: headers.append(string.join(attr, "; "))
+ return string.join(headers, ", ")
+
+def parse_ns_headers(ns_headers):
+ """Ad-hoc parser for Netscape protocol cookie-attributes.
+
+ The old Netscape cookie format for Set-Cookie can for instance contain
+ an unquoted "," in the expires field, so we have to use this ad-hoc
+ parser instead of split_header_words.
+
+ XXX This may not make the best possible effort to parse all the crap
+ that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient
+ parser is probably better, so could do worse than following that if
+ this ever gives any trouble.
+
+ Currently, this is also used for parsing RFC 2109 cookies.
+
+ """
+ known_attrs = ("expires", "domain", "path", "secure",
+ # RFC 2109 attrs (may turn up in Netscape cookies, too)
+ "port", "max-age")
+
+ result = []
+ for ns_header in ns_headers:
+ pairs = []
+ version_set = False
+ params = re.split(r";\s*", ns_header)
+ for ii in range(len(params)):
+ param = params[ii]
+ param = string.rstrip(param)
+ if param == "": continue
+ if "=" not in param:
+ k, v = param, None
+ else:
+ k, v = re.split(r"\s*=\s*", param, 1)
+ k = string.lstrip(k)
+ if ii != 0:
+ lc = string.lower(k)
+ if lc in known_attrs:
+ k = lc
+ if k == "version":
+ # This is an RFC 2109 cookie.
+ version_set = True
+ if k == "expires":
+ # convert expires date to seconds since epoch
+ if startswith(v, '"'): v = v[1:]
+ if endswith(v, '"'): v = v[:-1]
+ v = http2time(v) # None if invalid
+ pairs.append((k, v))
+
+ if pairs:
+ if not version_set:
+ pairs.append(("version", "0"))
+ result.append(pairs)
+
+ return result
+
+
+def _test():
+ # run the doctest examples embedded in this module's docstrings
+ import doctest, _HeadersUtil
+ return doctest.testmod(_HeadersUtil)
+
+if __name__ == "__main__":
+ _test()
Added: Zope3/trunk/src/ClientCookie/_LWPCookieJar.py
===================================================================
--- Zope3/trunk/src/ClientCookie/_LWPCookieJar.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/ClientCookie/_LWPCookieJar.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,188 @@
+"""Load / save to libwww-perl (LWP) format files.
+
+Actually, the format is slightly extended from that used by LWP's
+(libwww-perl's) HTTP::Cookies, to avoid losing some RFC 2965 information
+not recorded by LWP.
+
+It uses the version string "2.0", though really there isn't an LWP Cookies
+2.0 format. This indicates that there is extra information in here
+(domain_dot and port_spec) while still being compatible with libwww-perl,
+I hope.
+
+Copyright 2002-2004 John J Lee <jjl at pobox.com>
+Copyright 1997-1999 Gisle Aas (original libwww-perl code)
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD License (see the file COPYING included with the
+distribution).
+
+"""
+
+import time, re, string
+from _ClientCookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \
+ MISSING_FILENAME_TEXT, LoadError
+from _HeadersUtil import join_header_words, split_header_words
+from _Util import startswith, iso2time, time2isoz
+from _Debug import getLogger
+debug = getLogger("ClientCookie").debug
+
+try: True
+except NameError:
+ True = 1
+ False = 0
+
+def lwp_cookie_str(cookie):
+ """Return string representation of Cookie in an the LWP cookie file format.
+
+ Actually, the format is extended a bit -- see module docstring.
+
+ """
+ h = [(cookie.name, cookie.value),
+ ("path", cookie.path),
+ ("domain", cookie.domain)]
+ if cookie.port is not None: h.append(("port", cookie.port))
+ if cookie.path_specified: h.append(("path_spec", None))
+ if cookie.port_specified: h.append(("port_spec", None))
+ if cookie.domain_initial_dot: h.append(("domain_dot", None))
+ if cookie.secure: h.append(("secure", None))
+ if cookie.expires: h.append(("expires",
+ time2isoz(float(cookie.expires))))
+ if cookie.discard: h.append(("discard", None))
+ if cookie.comment: h.append(("comment", cookie.comment))
+ if cookie.comment_url: h.append(("commenturl", cookie.comment_url))
+ if cookie.rfc2109: h.append(("rfc2109", None))
+
+ keys = cookie.nonstandard_attr_keys()
+ keys.sort()
+ for k in keys:
+ h.append((k, str(cookie.get_nonstandard_attr(k))))
+
+ h.append(("version", str(cookie.version)))
+
+ return join_header_words([h])
+
+class LWPCookieJar(FileCookieJar):
+ """
+ The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
+ "Set-Cookie3" is the format used by the libwww-perl library, not known
+ to be compatible with any browser, but which is easy to read and
+ doesn't lose information about RFC 2965 cookies.
+
+ Additional methods
+
+ as_lwp_str(ignore_discard=True, ignore_expires=True)
+
+ """
+
+ # the first line of a cookies file must match this
+ magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
+
+ def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
+ """Return cookies as a string of "\n"-separated "Set-Cookie3" headers.
+
+ ignore_discard and ignore_expires: see docstring for FileCookieJar.save
+
+ """
+ now = time.time()
+ r = []
+ for cookie in self:
+ if not ignore_discard and cookie.discard:
+ debug(" Not saving %s: marked for discard", cookie.name)
+ continue
+ if not ignore_expires and cookie.is_expired(now):
+ debug(" Not saving %s: expired", cookie.name)
+ continue
+ r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
+ # the extra empty item makes the result end with a newline
+ return string.join(r+[""], "\n")
+
+ def save(self, filename=None, ignore_discard=False, ignore_expires=False):
+ """Write the cookies to a file as "Set-Cookie3" lines.
+
+ If filename is not specified, self.filename is used; if self.filename
+ is None, ValueError is raised. The file is opened for writing, so any
+ existing content is replaced.
+
+ """
+ if filename is None:
+ if self.filename is not None: filename = self.filename
+ else: raise ValueError(MISSING_FILENAME_TEXT)
+
+ f = open(filename, "w")
+ try:
+ debug("Saving LWP cookies file")
+ # There really isn't an LWP Cookies 2.0 format, but this indicates
+ # that there is extra information in here (domain_dot and
+ # port_spec) while still being compatible with libwww-perl, I hope.
+ f.write("#LWP-Cookies-2.0\n")
+ f.write(self.as_lwp_str(ignore_discard, ignore_expires))
+ finally:
+ f.close()
+
+ def _really_load(self, f, filename, ignore_discard, ignore_expires):
+ """Read "Set-Cookie3" lines from the open file f into this jar.
+
+ filename is only used in error messages. LoadError is raised if the
+ first line does not match magic_re, or if parsing fails.
+
+ """
+ magic = f.readline()
+ if not re.search(self.magic_re, magic):
+ msg = "%s does not seem to contain cookies" % filename
+ raise LoadError(msg)
+
+ now = time.time()
+
+ header = "Set-Cookie3:"
+ boolean_attrs = ("port_spec", "path_spec", "domain_dot",
+ "secure", "discard", "rfc2109")
+ value_attrs = ("version",
+ "port", "path", "domain",
+ "expires",
+ "comment", "commenturl")
+
+ try:
+ while 1:
+ line = f.readline()
+ if line == "": break
+ # lines that are not Set-Cookie3 headers are ignored
+ if not startswith(line, header):
+ continue
+ line = string.strip(line[len(header):])
+
+ for data in split_header_words([line]):
+ # the first pair is the cookie's own name and value
+ name, value = data[0]
+ standard = {}
+ rest = {}
+ for k in boolean_attrs:
+ standard[k] = False
+ for k, v in data[1:]:
+ if k is not None:
+ lc = string.lower(k)
+ else:
+ lc = None
+ # don't lose case distinction for unknown fields
+ if (lc in value_attrs) or (lc in boolean_attrs):
+ k = lc
+ if k in boolean_attrs:
+ if v is None: v = True
+ standard[k] = v
+ elif k in value_attrs:
+ standard[k] = v
+ else:
+ rest[k] = v
+
+ h = standard.get
+ expires = h("expires")
+ discard = h("discard")
+ if expires is not None:
+ expires = iso2time(expires)
+ # a cookie without an expiry time is marked for discard
+ if expires is None:
+ discard = True
+ domain = h("domain")
+ domain_specified = startswith(domain, ".")
+ c = Cookie(h("version"), name, value,
+ h("port"), h("port_spec"),
+ domain, domain_specified, h("domain_dot"),
+ h("path"), h("path_spec"),
+ h("secure"),
+ expires,
+ discard,
+ h("comment"),
+ h("commenturl"),
+ rest,
+ h("rfc2109"),
+ )
+ if not ignore_discard and c.discard:
+ continue
+ if not ignore_expires and c.is_expired(now):
+ continue
+ self.set_cookie(c)
+ except:
+ # NOTE(review): presumably re-raises exceptions that must not be
+ # turned into LoadError (IOError among them) -- confirm against
+ # reraise_unmasked_exceptions in _ClientCookie
+ reraise_unmasked_exceptions((IOError,))
+ raise LoadError("invalid Set-Cookie3 format file %s" % filename)
+
Added: Zope3/trunk/src/ClientCookie/_MSIECookieJar.py
===================================================================
--- Zope3/trunk/src/ClientCookie/_MSIECookieJar.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/ClientCookie/_MSIECookieJar.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,393 @@
+"""Microsoft Internet Explorer cookie loading on Windows.
+
+Copyright 2002-2003 Johnny Lee <typo_pl at hotmail.com> (MSIE Perl code)
+Copyright 2002-2004 John J Lee <jjl at pobox.com> (The Python port)
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD License (see the file COPYING included with the
+distribution).
+
+"""
+
+# XXX names and comments are not great here
+
+import os, re, string, time, struct
+if os.name == "nt":
+ import _winreg
+
+from _ClientCookie import FileCookieJar, CookieJar, Cookie, \
+ MISSING_FILENAME_TEXT, LoadError
+from _Util import startswith
+from _Debug import getLogger
+debug = getLogger("ClientCookie").debug
+
+try: True
+except NameError:
+ True = 1
+ False = 0
+
+
+def regload(path, leaf):
+ key = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, path, 0,
+ _winreg.KEY_ALL_ACCESS)
+ try:
+ value = _winreg.QueryValueEx(key, leaf)[0]
+ except WindowsError:
+ value = None
+ return value
+
+WIN32_EPOCH = 0x019db1ded53e8000L # 1970 Jan 01 00:00:00 in Win32 FILETIME
+
+def epoch_time_offset_from_win32_filetime(filetime):
+ """Convert from win32 filetime to seconds-since-epoch value.
+
+ MSIE stores create and expire times as Win32 FILETIME, which is 64
+ bits of 100 nanosecond intervals since Jan 01 1601.
+
+ ClientCookie expects time in 32-bit value expressed in seconds since the
+ epoch (Jan 01 1970).
+
+ """
+ if filetime < WIN32_EPOCH:
+ raise ValueError("filetime (%d) is before epoch (%d)" %
+ (filetime, WIN32_EPOCH))
+
+ return divmod((filetime - WIN32_EPOCH), 10000000L)[0]
+
+def binary_to_char(c): return "%02X" % ord(c)
+def binary_to_str(d): return string.join(map(binary_to_char, list(d)), "")
+
+class MSIEBase:
+ magic_re = re.compile(r"Client UrlCache MMF Ver \d\.\d.*")
+ padding = "\x0d\xf0\xad\x0b"
+
+ msie_domain_re = re.compile(r"^([^/]+)(/.*)$")
+ cookie_re = re.compile("Cookie\:.+\@([\x21-\xFF]+).*?"
+ "(.+\@[\x21-\xFF]+\.txt)")
+
+ # path under HKEY_CURRENT_USER from which to get location of index.dat
+ reg_path = r"software\microsoft\windows" \
+ r"\currentversion\explorer\shell folders"
+ reg_key = "Cookies"
+
+ def __init__(self):
+ self._delayload_domains = {}
+
+ def _delayload_domain(self, domain):
+ # if necessary, lazily load cookies for this domain
+ delayload_info = self._delayload_domains.get(domain)
+ if delayload_info is not None:
+ cookie_file, ignore_discard, ignore_expires = delayload_info
+ try:
+ self.load_cookie_data(cookie_file,
+ ignore_discard, ignore_expires)
+ except (LoadError, IOError):
+ debug("error reading cookie file, skipping: %s", cookie_file)
+ else:
+ del self._delayload_domains[domain]
+
+ def _load_cookies_from_file(self, filename):
+ debug("Loading MSIE cookies file: %s", filename)
+ cookies = []
+
+ cookies_fh = open(filename)
+
+ try:
+ while 1:
+ key = cookies_fh.readline()
+ if key == "": break
+
+ rl = cookies_fh.readline
+ def getlong(rl=rl): return long(rl().rstrip())
+ def getstr(rl=rl): return rl().rstrip()
+
+ key = key.rstrip()
+ value = getstr()
+ domain_path = getstr()
+ flags = getlong() # 0x2000 bit is for secure I think
+ lo_expire = getlong()
+ hi_expire = getlong()
+ lo_create = getlong()
+ hi_create = getlong()
+ sep = getstr()
+
+ if "" in (key, value, domain_path, flags, hi_expire, lo_expire,
+ hi_create, lo_create, sep) or (sep != "*"):
+ break
+
+ m = self.msie_domain_re.search(domain_path)
+ if m:
+ domain = m.group(1)
+ path = m.group(2)
+
+ cookies.append({"KEY": key, "VALUE": value, "DOMAIN": domain,
+ "PATH": path, "FLAGS": flags, "HIXP": hi_expire,
+ "LOXP": lo_expire, "HICREATE": hi_create,
+ "LOCREATE": lo_create})
+ finally:
+ cookies_fh.close()
+
+ return cookies
+
+ def load_cookie_data(self, filename,
+ ignore_discard=False, ignore_expires=False):
+ """Load cookies from file containing actual cookie data.
+
+ Old cookies are kept unless overwritten by newly loaded ones.
+
+ You should not call this method if the delayload attribute is set.
+
+ I think each of these files contain all cookies for one user, domain,
+ and path.
+
+ filename: file containing cookies -- usually found in a file like
+ C:\WINNT\Profiles\joe\Cookies\joe at blah[1].txt
+
+ """
+ now = int(time.time())
+
+ cookie_data = self._load_cookies_from_file(filename)
+
+ for cookie in cookie_data:
+ flags = cookie["FLAGS"]
+ secure = ((flags & 0x2000) != 0)
+ filetime = (cookie["HIXP"] << 32) + cookie["LOXP"]
+ expires = epoch_time_offset_from_win32_filetime(filetime)
+ if expires < now:
+ discard = True
+ else:
+ discard = False
+ domain = cookie["DOMAIN"]
+ initial_dot = startswith(domain, ".")
+ if initial_dot:
+ domain_specified = True
+ else:
+ # MSIE 5 does not record whether the domain cookie-attribute
+ # was specified.
+ # Assuming it wasn't is conservative, because with strict
+ # domain matching this will match less frequently; with regular
+ # Netscape tail-matching, this will match at exactly the same
+ # times that domain_specified = True would. It also means we
+ # don't have to prepend a dot to achieve consistency with our
+ # own & Mozilla's domain-munging scheme.
+ domain_specified = False
+
+ # assume path_specified is false
+ # XXX is there other stuff in here? -- eg. comment, commentURL?
+ c = Cookie(0,
+ cookie["KEY"], cookie["VALUE"],
+ None, False,
+ domain, domain_specified, initial_dot,
+ cookie["PATH"], False,
+ secure,
+ expires,
+ discard,
+ None,
+ None,
+ {"flags": flags})
+ if not ignore_discard and c.discard:
+ continue
+ if not ignore_expires and c.is_expired(now):
+ continue
+ CookieJar.set_cookie(self, c)
+
+ def load_from_registry(self, ignore_discard=False, ignore_expires=False,
+ username=None):
+ """
+ username: only required on win9x
+
+ """
+ cookies_dir = regload(self.reg_path, self.reg_key)
+ filename = os.path.normpath(os.path.join(cookies_dir, "INDEX.DAT"))
+ self.load(filename, ignore_discard, ignore_expires, username)
+
+ def _really_load(self, index, filename, ignore_discard, ignore_expires,
+ username):
+ now = int(time.time())
+
+ if username is None:
+ username = string.lower(os.environ['USERNAME'])
+
+ cookie_dir = os.path.dirname(filename)
+
+ data = index.read(256)
+ if len(data) != 256:
+ raise LoadError("%s file is too short" % filename)
+
+ # Cookies' index.dat file starts with 32 bytes of signature
+ # followed by an offset to the first record, stored as a little-
+ # endian DWORD.
+ sig, size, data = data[:32], data[32:36], data[36:]
+ size = struct.unpack("<L", size)[0]
+
+ # check that sig is valid
+ if not self.magic_re.match(sig) or size != 0x4000:
+ raise LoadError("%s ['%s' %s] does not seem to contain cookies" %
+ (str(filename), sig, size))
+
+ # skip to start of first record
+ index.seek(size, 0)
+
+ sector = 128 # size of sector in bytes
+
+ while 1:
+ data = ""
+
+ # Cookies are usually in two contiguous sectors, so read in two
+ # sectors and adjust if not a Cookie.
+ to_read = 2 * sector
+ d = index.read(to_read)
+ if len(d) != to_read:
+ break
+ data = data + d
+
+ # Each record starts with a 4-byte signature and a count
+ # (little-endian DWORD) of sectors for the record.
+ sig, size, data = data[:4], data[4:8], data[8:]
+ size = struct.unpack("<L", size)[0]
+
+ to_read = (size - 2) * sector
+
+## from urllib import quote
+## print "data", quote(data)
+## print "sig", quote(sig)
+## print "size in sectors", size
+## print "size in bytes", size*sector
+## print "size in units of 16 bytes", (size*sector) / 16
+## print "size to read in bytes", to_read
+## print
+
+ if sig != "URL ":
+ assert (sig in ("HASH", "LEAK",
+ self.padding, "\x00\x00\x00\x00"),
+ "unrecognized MSIE index.dat record: %s" %
+ binary_to_str(sig))
+ if sig == "\x00\x00\x00\x00":
+ # assume we've got all the cookies, and stop
+ break
+ if sig == self.padding:
+ continue
+ # skip the rest of this record
+ assert to_read >= 0
+ if size != 2:
+ assert to_read != 0
+ index.seek(to_read, 1)
+ continue
+
+ # read in rest of record if necessary
+ if size > 2:
+ more_data = index.read(to_read)
+ if len(more_data) != to_read: break
+ data = data + more_data
+
+ cookie_re = ("Cookie\:%s\@([\x21-\xFF]+).*?" % username +
+ "(%s\@[\x21-\xFF]+\.txt)" % username)
+ m = re.search(cookie_re, data, re.I)
+ if m:
+ cookie_file = os.path.join(cookie_dir, m.group(2))
+ if not self.delayload:
+ try:
+ self.load_cookie_data(cookie_file,
+ ignore_discard, ignore_expires)
+ except (LoadError, IOError):
+ debug("error reading cookie file, skipping: %s",
+ cookie_file)
+ else:
+ domain = m.group(1)
+ i = domain.find("/")
+ if i != -1:
+ domain = domain[:i]
+
+ self._delayload_domains[domain] = (
+ cookie_file, ignore_discard, ignore_expires)
+
+
+class MSIECookieJar(MSIEBase, FileCookieJar):
+ """FileCookieJar that reads from the Windows MSIE cookies database.
+
+ MSIECookieJar can read the cookie files of Microsoft Internet Explorer
+ (MSIE) for Windows version 5 on Windows NT and version 6 on Windows XP and
+ Windows 98. Other configurations may also work, but are untested. Saving
+ cookies in MSIE format is NOT supported. If you save cookies, they'll be
+ in the usual Set-Cookie3 format, which you can read back in using an
+ instance of the plain old CookieJar class. Don't save using the same
+ filename that you loaded cookies from, because you may succeed in
+ clobbering your MSIE cookies index file!
+
+ You should be able to have LWP share Internet Explorer's cookies like
+ this (note you need to supply a username to load_from_registry if you're on
+ Windows 9x or Windows ME):
+
+ cj = MSIECookieJar(delayload=1)
+ # find cookies index file in registry and load cookies from it
+ cj.load_from_registry()
+ opener = ClientCookie.build_opener(ClientCookie.HTTPCookieProcessor(cj))
+ response = opener.open("http://example.com/")
+
+ Iterating over a delayloaded MSIECookieJar instance will not cause any
+ cookies to be read from disk. To force reading of all cookies from disk,
+ call read_all_cookies. Note that the following methods iterate over self:
+ clear_temporary_cookies, clear_expired_cookies, __len__, __repr__, __str__
+ and as_string.
+
+ Additional methods:
+
+ load_from_registry(ignore_discard=False, ignore_expires=False,
+ username=None)
+ load_cookie_data(filename, ignore_discard=False, ignore_expires=False)
+ read_all_cookies()
+
+ """
+ def __init__(self, filename=None, delayload=False, policy=None):
+ MSIEBase.__init__(self)
+ FileCookieJar.__init__(self, filename, delayload, policy)
+
+ def set_cookie(self, cookie):
+ if self.delayload:
+ self._delayload_domain(cookie.domain)
+ CookieJar.set_cookie(self, cookie)
+
+ def _cookies_for_request(self, request):
+ """Return a list of cookies to be returned to server."""
+ domains = self._cookies.copy()
+ domains.update(self._delayload_domains)
+ domains = domains.keys()
+
+ cookies = []
+ for domain in domains:
+ cookies.extend(self._cookies_for_domain(domain, request))
+ return cookies
+
+ def _cookies_for_domain(self, domain, request):
+ if not self._policy.domain_return_ok(domain, request):
+ return []
+ debug("Checking %s for cookies to return", domain)
+ if self.delayload:
+ self._delayload_domain(domain)
+ return CookieJar._cookies_for_domain(self, domain, request)
+
+ def read_all_cookies(self):
+ """Eagerly read in all cookies."""
+ if self.delayload:
+ for domain in self._delayload_domains.keys():
+ self._delayload_domain(domain)
+
+ def load(self, filename, ignore_discard=False, ignore_expires=False,
+ username=None):
+ """Load cookies from an MSIE 'index.dat' cookies index file.
+
+ filename: full path to cookie index file
+ username: only required on win9x
+
+ """
+ if filename is None:
+ if self.filename is not None: filename = self.filename
+ else: raise ValueError(MISSING_FILENAME_TEXT)
+
+ index = open(filename, "rb")
+
+ try:
+ self._really_load(index, filename, ignore_discard, ignore_expires,
+ username)
+ finally:
+ index.close()
Added: Zope3/trunk/src/ClientCookie/_MSIEDBCookieJar.py
===================================================================
--- Zope3/trunk/src/ClientCookie/_MSIEDBCookieJar.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/ClientCookie/_MSIEDBCookieJar.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,140 @@
+"""Persistent CookieJar based on MS Internet Explorer cookie database.
+
+Copyright 2003-2004 John J Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD License (see the file COPYING included with the
+distribution).
+
+**********************************************************************
+THIS DOESN'T WORK!
+
+It's just a sketch, to check the base class is OK.
+
+**********************************************************************
+
+"""
+
+from ClientCookie import MSIEBase, CookieJar
+from _Util import time2netscape
+
def set_cookie_hdr_from_cookie(cookie):
    """Build a Set-Cookie header value describing cookie.

    cookie is read through its attributes: name, value, expires,
    domain_specified, domain, path_specified, path, port_specified, port
    and secure.

    Returns the header parameters joined with "; ".

    """
    params = []
    # A cookie set with a bare 'Set-Cookie: foo' has name "foo" and value
    # None, so it is the value (not the name) that decides the form.
    if cookie.value is not None:
        params.append("%s=%s" % (cookie.name, cookie.value))
    else:
        params.append(cookie.name)
    if cookie.expires:
        params.append("expires=%s" % time2netscape(cookie.expires))
    if cookie.domain_specified:
        params.append("Domain=%s" % cookie.domain)
    if cookie.path_specified:
        params.append("path=%s" % cookie.path)
    if cookie.port_specified:
        if cookie.port is None:
            params.append("Port")
        else:
            params.append("Port=%s" % cookie.port)
    if cookie.secure:
        params.append("secure")
##     if cookie.comment:
##         params.append("Comment=%s" % cookie.comment)
##     if cookie.comment_url:
##         params.append("CommentURL=%s" % cookie.comment_url)
    return "; ".join(params)
+
class MSIEDBCookieJar(MSIEBase, CookieJar):
    """A CookieJar that relies on MS Internet Explorer's cookie database.

    XXX Require ctypes or write C extension?  win32all probably requires
    latter.

    **********************************************************************
    THIS DOESN'T WORK!

    It's just a sketch, to check the base class is OK.

    **********************************************************************

    MSIEDBCookieJar, unlike MSIECookieJar, keeps no state for itself, but
    relies on the MS Internet Explorer's cookie database.  It uses the win32
    API functions InternetGetCookie() and InternetSetCookie(), from the wininet
    library.

    Note that MSIE itself may impose additional conditions on cookie processing
    on top of that done by CookiePolicy.  For cookie setting, the class tries
    to foil that by providing the request details and Set-Cookie header it
    thinks MSIE wants to see.  For returning cookies to the server, it's up to
    MSIE.

    Note that session cookies ARE NOT written to disk and won't be accessible
    from other processes.  .clear_session_cookies() has no effect.

    .clear_expired_cookies() has no effect: MSIE is responsible for this.

    .clear() will raise NotImplementedError unless all three arguments are
    given.

    """
    # NOTE(review): `windll` and `debug` are used below but are not imported
    # by this module; they must be provided before the sketch can run.

    def __init__(self, policy=None):
        MSIEBase.__init__(self)
        # The class derives from CookieJar (FileCookieJar is neither a base
        # nor imported here), so initialise that base with the policy.
        CookieJar.__init__(self, policy)

    def clear_session_cookies(self):
        pass

    def clear_expired_cookies(self):
        pass

    def clear(self, domain=None, path=None, name=None):
        if None in [domain, path, name]:
            raise NotImplementedError()
        # XXXX
        url = self._fake_url(domain, path)
        hdr = "%s=; domain=%s; path=%s; max-age=0" % (name, domain, path)
        r = windll.InternetSetCookie(url, None, hdr)
        # XXX return value of InternetSetCookie?

    def _fake_url(self, domain, path):
        # to convince MSIE that Set-Cookie is OK
        return "http://%s%s" % (domain, path)

    def set_cookie(self, cookie):
        # XXXX
        url = self._fake_url(cookie.domain, cookie.path)
        r = windll.InternetSetCookie(
            url, None, set_cookie_hdr_from_cookie(cookie))
        # XXX return value of InternetSetCookie?

    def add_cookie_header(self, request, unverifiable=False):
        # XXXX
        cookie_header = windll.InternetGetCookie(request.get_full_url())
        # XXX return value of InternetGetCookie?
        # add_unredirected_header needs the header name as well as its value
        request.add_unredirected_header("Cookie", cookie_header)

    def __iter__(self):
        self._load_index_dat()
        return CookieJar.__iter__(self)

    def _cookies_for_request(self, request):
        raise NotImplementedError()  # XXXX

    def _cookies_for_domain(self, domain, request):
        #raise NotImplementedError()  # XXXX
        debug("Checking %s for cookies to return", domain)
        if not self._policy.domain_return_ok(domain, request):
            return []

        # XXXX separate out actual loading of cookie data, so only index.dat is
        #  read in ._load_index_dat(), and ._really_load() calls that, then
        #  ._delayload_domain for all domains if not self.delayload.
        #  We then just call ._load_index_dat()
        self._delayload = False
        self._really_load()

        cookies_by_path = self._cookies.get(domain)
        if cookies_by_path is None:
            return []

        # NOTE(review): `unverifiable` is not defined in this scope; it is
        # left as in the sketch because the policy signature it targets is
        # not visible from here.
        cookies = []
        for path in cookies_by_path.keys():
            if not self._policy.path_return_ok(path, request, unverifiable):
                continue
            for name, cookie in cookies_by_path[path].items():
                if not self._policy.return_ok(cookie, request, unverifiable):
                    debug("   not returning cookie")
                    continue
                debug("   it's a match")
                cookies.append(cookie)

        return cookies
+
Added: Zope3/trunk/src/ClientCookie/_MozillaCookieJar.py
===================================================================
--- Zope3/trunk/src/ClientCookie/_MozillaCookieJar.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/ClientCookie/_MozillaCookieJar.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,173 @@
+"""Mozilla / Netscape cookie loading / saving.
+
+Copyright 2002-2004 John J Lee <jjl at pobox.com>
+Copyright 1997-1999 Gisle Aas (original libwww-perl code)
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD License (see the file COPYING included with the
+distribution).
+
+"""
+
+import re, string, time
+
+from _ClientCookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \
+ MISSING_FILENAME_TEXT, LoadError
+from _Util import startswith, endswith
+from _Debug import getLogger
+debug = getLogger("ClientCookie").debug
+
+try: True
+except NameError:
+ True = 1
+ False = 0
+
+try: issubclass(Exception(), (Exception,))
+except TypeError:
+ real_issubclass = issubclass
+ from _Util import compat_issubclass
+ issubclass = compat_issubclass
+ del compat_issubclass
+
+
class MozillaCookieJar(FileCookieJar):
    """FileCookieJar that reads and writes Mozilla/Netscape `cookies.txt'.

    WARNING: you may want to backup your browser's cookies file if you use
    this class to save cookies.  I *think* it works, but there have been
    bugs in the past!

    The only difference from CookieJar is the on-disk format: this class
    uses the Mozilla/Netscape `cookies.txt' format (lynx uses it too).

    Don't expect cookies saved while the browser is running to be noticed by
    the browser (in fact, Mozilla on unix will overwrite your saved cookies if
    you change them on disk while it's running; on Windows, you probably can't
    save at all while the browser is running).

    Saving downgrades RFC2965 cookies to Netscape cookies.  In particular,
    the cookie version and port number information is lost, together with
    information about whether or not Path, Port and Discard were specified by
    the Set-Cookie2 (or Set-Cookie) header, and whether or not the domain as
    set in the HTTP header started with a dot (yes, I'm aware some domains in
    Netscape files start with a dot and some don't -- trust me, you really
    don't want to know any more about this).

    Mozilla and Netscape share the format but use slightly different
    headers.  The class saves cookies using the Netscape header by default
    (Mozilla can cope with that).

    """
    magic_re = "#( Netscape)? HTTP Cookie File"
    header = """\
    # Netscape HTTP Cookie File
    # http://www.netscape.com/newsref/std/cookie_spec.html
    # This is a generated file! Do not edit.

"""

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse the open cookies.txt file f and add its cookies to the jar.

        Raises LoadError if the first line is not the expected magic header
        or if a record cannot be parsed.

        """
        now = time.time()

        first_line = f.readline()
        if not re.search(self.magic_re, first_line):
            f.close()
            raise LoadError(
                "%s does not look like a Netscape format cookies file" %
                filename)

        try:
            while 1:
                line = f.readline()
                if line == "":
                    break

                # last field may be absent, so keep any trailing tab
                if endswith(line, "\n"):
                    line = line[:-1]

                # skip comments and blank lines XXX what is $ for?
                stripped = line.strip()
                if (startswith(stripped, "#") or
                    startswith(stripped, "$") or
                    stripped == ""):
                    continue

                fields = line.split("\t")
                (domain, domain_specified, path, secure, expires,
                 name, value) = fields
                secure = (secure == "TRUE")
                domain_specified = (domain_specified == "TRUE")
                if name == "":
                    # cookies.txt stores a valueless cookie as a nameless one
                    name, value = value, None

                initial_dot = startswith(domain, ".")
                assert domain_specified == initial_dot

                # no expiry date means a session cookie
                discard = (expires == "")
                if discard:
                    expires = None

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                if c.discard and not ignore_discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                self.set_cookie(c)

        except:
            # let IOError (and other unmasked exceptions) through; anything
            # else is reported as a malformed file
            reraise_unmasked_exceptions((IOError,))
            raise LoadError("invalid Netscape format file %s: %s" %
                            (filename, line))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar's cookies to filename in cookies.txt format.

        Falls back to self.filename when filename is None; raises ValueError
        if neither is set.  Discardable and expired cookies are skipped
        unless the corresponding ignore_* flag is true.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        f = open(filename, "w")
        try:
            debug("Saving Netscape cookies.txt file")
            f.write(self.header)
            now = time.time()
            for cookie in self:
                if cookie.discard and not ignore_discard:
                    debug("   Not saving %s: marked for discard", cookie.name)
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    debug("   Not saving %s: expired", cookie.name)
                    continue

                secure = "FALSE"
                if cookie.secure:
                    secure = "TRUE"
                initial_dot = "FALSE"
                if startswith(cookie.domain, "."):
                    initial_dot = "TRUE"
                expires = ""
                if cookie.expires is not None:
                    expires = str(cookie.expires)

                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas cookielib regards it as a
                    # cookie with no value.
                    name, value = "", cookie.name
                else:
                    name, value = cookie.name, cookie.value

                record = [cookie.domain, initial_dot, cookie.path,
                          secure, expires, name, value]
                f.write("\t".join(record) + "\n")
        finally:
            f.close()
Added: Zope3/trunk/src/ClientCookie/_Opener.py
===================================================================
--- Zope3/trunk/src/ClientCookie/_Opener.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/ClientCookie/_Opener.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,197 @@
+"""Integration with Python standard library module urllib2: OpenerDirector
+class.
+
+Copyright 2004 John J Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD License (see the file COPYING included with the
+distribution).
+
+"""
+
+try: True
+except NameError:
+ True = 1
+ False = 0
+
+import urllib2, string, bisect, urlparse
+
+from _Util import startswith, isstringlike
+from _Request import Request
+
def methnames(obj):
    """Return the names of the callable attributes of instance obj.

    dir(obj) doesn't work across Python versions, this does.

    """
    name_dict = methnames_of_instance_as_dict(obj)
    return name_dict.keys()
+
def methnames_of_instance_as_dict(inst):
    """Return a dict whose keys name the callable attributes of inst.

    Names found on the instance's class (and its bases) are included too.
    All values are None.

    """
    found = {}
    found.update(methnames_of_class_as_dict(inst.__class__))
    for attr_name in dir(inst):
        if callable(getattr(inst, attr_name)):
            found[attr_name] = None
    return found
+
def methnames_of_class_as_dict(klass):
    """Return a dict whose keys name the callable attributes of klass.

    Base classes are searched recursively.  All values are None.

    """
    found = {}
    for attr_name in dir(klass):
        if callable(getattr(klass, attr_name)):
            found[attr_name] = None
    for parent in klass.__bases__:
        found.update(methnames_of_class_as_dict(parent))
    return found
+
+
class OpenerMixin:
    """Request-building and download helpers for opener classes.

    retrieve() calls self.open(), which the class this is mixed into must
    provide.

    """

    def _request(self, url_or_req, data):
        """Return a request object for url_or_req, attaching data if given."""
        if isstringlike(url_or_req):
            req = Request(url_or_req, data)
        else:
            # already a urllib2.Request or ClientCookie.Request instance
            req = url_or_req
            if data is not None:
                req.add_data(data)
        return req

    def retrieve(self, fullurl, filename=None, reporthook=None, data=None):
        """Returns (filename, headers).

        For remote objects, the default filename will refer to a temporary
        file.

        reporthook, if given, is called as reporthook(blocknum, blocksize,
        totalsize); totalsize is -1 when no Content-Length header is present.

        Raises IOError if fewer bytes than the advertised Content-Length
        were received.

        """
        # Local imports: this module does not import these names at top level.
        import os
        import tempfile
        from urllib import url2pathname

        req = self._request(fullurl, data)
        type_ = req.get_type()
        fp = self.open(req)
        tfp = None
        try:
            headers = fp.info()
            if filename is None and type_ == 'file':
                return url2pathname(req.get_selector()), headers
            if filename:
                tfp = open(filename, 'wb')
            else:
                # fullurl may be a request object, so ask the request for
                # its URL rather than parsing fullurl directly
                path = urlparse.urlparse(req.get_full_url())[2]
                suffix = os.path.splitext(path)[1]
                # a named temporary file that survives close(), so that the
                # returned filename is usable by the caller
                fd, filename = tempfile.mkstemp(suffix)
                tfp = os.fdopen(fd, 'wb')
            result = filename, headers
            bs = 1024*8
            size = -1
            read = 0
            blocknum = 1
            if reporthook:
                if headers.has_key("content-length"):
                    size = int(headers["Content-Length"])
                reporthook(0, bs, size)
            while 1:
                block = fp.read(bs)
                read += len(block)
                if reporthook:
                    reporthook(blocknum, bs, size)
                blocknum = blocknum + 1
                if not block:
                    break
                tfp.write(block)
        finally:
            # release both handles whether or not the transfer succeeded
            if tfp is not None:
                tfp.close()
            fp.close()

        if size >= 0 and read < size:
            raise IOError("incomplete retrieval error",
                          "got only %d bytes out of %d" % (read, size))
        return result
+
+
class OpenerDirector(urllib2.OpenerDirector, OpenerMixin):
    """urllib2.OpenerDirector that also runs request/response processors.

    Handlers may define <scheme>_request and <scheme>_response methods;
    these are called before and after the request is opened.

    """

    def __init__(self):
        urllib2.OpenerDirector.__init__(self)
        # scheme -> sorted list of handlers, one table per processing stage
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register handler under every <protocol>_<condition> method it has."""
        registered = False
        for name in methnames(handler):
            sep = name.find("_")
            protocol = name[:sep]
            condition = name[sep+1:]

            if startswith(condition, "error"):
                sep2 = condition.find("_") + sep + 1
                kind = name[sep2+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.setdefault(protocol, {})
            elif (condition == "open" and
                  protocol not in ["do", "proxy"]):  # hack -- see below
                kind = protocol
                lookup = self.handle_open
            elif (condition in ["response", "request"] and
                  protocol != "redirect"):  # yucky hack
                # hack above is to fix HTTPRedirectHandler problem, which
                # appears to above line to be a processor because of the
                # redirect_request method :-((
                kind = protocol
                lookup = getattr(self, "process_"+condition)
            else:
                continue

            # keep each handler list sorted
            bisect.insort(lookup.setdefault(kind, []), handler)
            registered = True

        if registered:
            # XXX why does self.handlers need to be sorted?
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def open(self, fullurl, data=None):
        """Open fullurl, running the scheme's processors around the request."""
        req = self._request(fullurl, data)
        scheme = req.get_type()

        # pre-process request
        # XXX should we allow a Processor to change the type (URL
        #   scheme) of the request?
        request_meth = scheme+"_request"
        for processor in self.process_request.get(scheme, []):
            req = getattr(processor, request_meth)(req)

        response = urllib2.OpenerDirector.open(self, req, data)

        # post-process response
        response_meth = scheme+"_response"
        for processor in self.process_response.get(scheme, []):
            response = getattr(processor, response_meth)(req, response)

        return response

    def error(self, proto, *args):
        """Dispatch an error to the handler chain registered for proto."""
        is_http = proto in ['http', 'https']
        if is_http:
            # XXX http[s] protocols are special-cased
            # https is not different than http
            table = self.handle_error['http']
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            orig_args = args
        else:
            table = self.handle_error
            meth_name = proto + '_error'
        chain_args = (table, proto, meth_name) + args
        result = self._call_chain(*chain_args)
        if result:
            return result

        if is_http:
            # nobody handled this status code: fall back to the default
            chain_args = (table, 'default', 'http_error_default') + orig_args
            return self._call_chain(*chain_args)
Added: Zope3/trunk/src/ClientCookie/_Request.py
===================================================================
--- Zope3/trunk/src/ClientCookie/_Request.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/ClientCookie/_Request.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,73 @@
+"""Integration with Python standard library module urllib2: Request class.
+
+Copyright 2004 John J Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD License (see the file COPYING included with the
+distribution).
+
+"""
+
+try: True
+except NameError:
+ True = 1
+ False = 0
+
+import urllib2, string
+
+from _ClientCookie import request_host
+
+
class Request(urllib2.Request):
    """urllib2.Request extended with RFC 2965 origin/unverifiable state.

    Also tracks "unredirected" headers, i.e. headers that must not be
    copied onto a redirected request.

    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        urllib2.Request.__init__(self, url, data, headers)
        self.unredirected_hdrs = {}

        # All the terminology below comes from RFC 2965.
        self.unverifiable = unverifiable
        # Set request-host of origin transaction.
        # The origin request-host is needed in order to decide whether
        # unverifiable sub-requests (automatic redirects, images embedded
        # in HTML, etc.) are to third-party hosts.  If they are, the
        # resulting transactions might need to be conducted with cookies
        # turned off.
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host

    def get_origin_req_host(self):
        """Return the request-host of the origin transaction."""
        return self.origin_req_host

    def is_unverifiable(self):
        """Return the unverifiable flag given at construction."""
        return self.unverifiable

    def add_unredirected_header(self, key, val):
        """Add a header that will not be added to a redirected request."""
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """True iff request has named header (regular or unredirected)."""
        for table in (self.headers, self.unredirected_hdrs):
            if table.has_key(header_name):
                return True
        return False

    def get_header(self, header_name, default=None):
        """Return the named header, preferring regular over unredirected."""
        fallback = self.unredirected_hdrs.get(header_name, default)
        return self.headers.get(header_name, fallback)

    def header_items(self):
        """Return (name, value) pairs; regular headers win on a clash."""
        merged = self.unredirected_hdrs.copy()
        merged.update(self.headers)
        return merged.items()

    def __str__(self):
        return "<Request for %s>" % self.get_full_url()

    def get_method(self):
        """Return "POST" if the request carries data, else "GET"."""
        if self.has_data():
            return "POST"
        return "GET"
Added: Zope3/trunk/src/ClientCookie/_Util.py
===================================================================
--- Zope3/trunk/src/ClientCookie/_Util.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/ClientCookie/_Util.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,563 @@
+"""Python backwards-compat., date/time routines, seekable file object wrapper.
+
+ Copyright 2002-2004 John J Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD License (see the file COPYING included with the
+distribution).
+
+"""
+
+try: True
+except NameError:
+ True = 1
+ False = 0
+
+import re, string, time
+from types import TupleType
+from StringIO import StringIO
+
+try:
+ from exceptions import StopIteration
+except ImportError:
+ from ClientCookie._ClientCookie import StopIteration
+
def startswith(string, initial):
    """Return true iff string begins with initial."""
    prefix_len = len(initial)
    return prefix_len <= len(string) and string[:prefix_len] == initial
+
def endswith(string, final):
    """Return true iff string ends with final.

    An empty final matches every string, as with startswith().

    """
    suffix_len = len(final)
    if suffix_len > len(string):
        return False
    # Slice from an explicit start index: string[-0:] would be the whole
    # string, which made the empty-suffix case wrongly return False.
    return string[len(string) - suffix_len:] == final
+
def compat_issubclass(obj, tuple_or_class):
    """issubclass() that also accepts a tuple of classes (for 2.1 and below)."""
    if type(tuple_or_class) != type(()):
        return issubclass(obj, tuple_or_class)
    for candidate in tuple_or_class:
        if issubclass(obj, candidate):
            return True
    return False
+
def compat_isinstance(obj, tuple_or_class):
    """isinstance() that also accepts a tuple of classes (for 2.1 and below)."""
    if type(tuple_or_class) != type(()):
        return isinstance(obj, tuple_or_class)
    for candidate in tuple_or_class:
        if isinstance(obj, candidate):
            return True
    return False
+
def isstringlike(x):
    """Return true iff x can be concatenated with a string."""
    try:
        x + ""
    except:
        # any failure at all means "not string-like"
        return False
    return True
+
# Lookup table of whitespace characters (the values are unused).
SPACE_DICT = {}
for c in string.whitespace:
    SPACE_DICT[c] = None
del c

def isspace(string):
    """Return true iff every character of string is whitespace.

    An empty string counts as all-whitespace.

    """
    for char in string:
        # stored values are None, so a non-None result means "absent"
        if SPACE_DICT.get(char, 0) is not None:
            return False
    return True
+
+# this is here rather than in _HeadersUtil as it's just for
+# compatibility with old Python versions, rather than entirely new code
def getheaders(msg, name):
    """Get all values for a header.

    Returns one list entry per occurrence of the header; continuation
    lines are folded into the entry they belong to, and each value is
    stripped in the same way as the result of getheader().  If the header
    is not given, return an empty list.
    """
    values = []
    current = ''
    seen_header = 0
    for raw in msg.getallmatchingheaders(name):
        if not isspace(raw[0]):
            # a new "Name: value" line: flush the previous value first
            if seen_header:
                values.append(current)
            current = raw[raw.find(":") + 1:].strip()
            seen_header = 1
        elif current:
            # continuation line of the value being collected
            current = "%s\n %s" % (current, raw.strip())
        else:
            current = raw.strip()
    if seen_header:
        values.append(current)
    return values
+
try:
    from calendar import timegm
    timegm((2045, 1, 1, 22, 23, 32))  # overflows in 2.1
except:
    # The fallback below is only used when the library timegm is missing
    # or overflows.

    # Number of days per month (except for February in leap years)
    mdays = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

    # Return 1 for leap years, 0 for non-leap years
    def isleap(year):
        return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)

    # Return number of leap years in range [y1, y2)
    # Assume y1 <= y2 and no funny (non-leap century) years
    def leapdays(y1, y2):
        return (y2+3)/4 - (y1+3)/4

    EPOCH = 1970
    def timegm(tuple):
        """Unrelated but handy function to calculate Unix timestamp from GMT."""
        year, month, day, hour, minute, second = tuple[:6]
        assert year >= EPOCH
        assert 1 <= month <= 12
        days = 365*(year-EPOCH) + leapdays(EPOCH, year)
        for month_index in range(1, month):
            days = days + mdays[month_index]
        if month > 2 and isleap(year):
            days = days + 1
        days = days + day - 1
        hours = days*24 + hour
        minutes = hours*60 + minute
        # force long arithmetic so the result cannot overflow a plain int
        return minutes*long(60) + second
+
+
+# Date/time conversion routines for formats used by the HTTP protocol.
+
EPOCH = 1970
def my_timegm(tt):
    """Return timegm(tt), or None if a field of tt is out of range.

    tt is a time tuple whose first six items are year, month, day of
    month, hour, minute and second.

    """
    year, month, mday, hour, min, sec = tt[:6]
    in_range = ((year >= EPOCH) and (1 <= month <= 12) and
                (1 <= mday <= 31) and (0 <= hour <= 24) and
                (0 <= min <= 59) and (0 <= sec <= 61))
    if not in_range:
        return None
    return timegm(tt)
+
# Abbreviated English day and month names, as used in HTTP dates.
days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copy of months, for case-insensitive lookup.
months_lower = []
for month in months:
    months_lower.append(month.lower())
+
+
def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    With no argument (or None), the current time is used.

    The result is formatted as "YYYY-MM-DD hh:mm:ssZ", in Universal Time
    (UTC, aka GMT).  For example:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    date_and_clock = tuple(time.gmtime(t)[:6])
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % date_and_clock
+
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None: t = time.time()
    year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7]
    # The weekday is followed by a comma, as documented above and as the
    # Netscape cookie date format requires; the original omitted it.
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        days[wday], mday, months[mon-1], year, hour, min, sec)
+
+
# Timezone names that mean "no offset from UTC" (the values are unused).
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

timezone_re = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
def offset_from_tz_string(tz):
    """Return the UTC offset in seconds named by tz, or None if unknown.

    tz is either one of the UTC_ZONES names or a numeric zone such as
    "-0800", "+01:30" or "5".

    """
    try:
        UTC_ZONES[tz]
    except KeyError:
        pass
    else:
        return 0

    match = timezone_re.search(tz)
    if not match:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds = seconds + 60 * int(minutes)
    if sign == '-':
        seconds = -seconds
    return seconds
+
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date/time fields to seconds since the epoch.

    mon may be a month name or a number; hr, min and sec may be None
    (taken as 0); tz may be None (taken as UTC).  Returns None if the
    month, the field ranges or the timezone are not acceptable.

    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = months_lower.index(mon.lower()) + 1
    except ValueError:
        # maybe it's already a number
        try:
            month_number = int(mon)
        except ValueError:
            return None
        if not (1 <= month_number <= 12):
            return None
        mon = month_number

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr, day, hr, min, sec = int(yr), int(day), int(hr), int(min), int(sec)

    if yr < 1000:
        # find "obvious" year: pick the century that puts the date
        # closest to the current year
        this_year = time.localtime(time.time())[0]
        year_in_century = this_year % 100
        distance = year_in_century - yr
        yr = yr + this_year - year_in_century
        if distance > 50:
            yr = yr + 100
        elif distance < -50:
            yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = my_timegm((yr, mon, day, hr, min, sec, tz))
    if t is None:
        return None

    # adjust time using timezone string, to get absolute time since epoch
    if tz is None:
        tz = "UTC"
    offset = offset_from_tz_string(tz.upper())
    if offset is None:
        return None
    return t - offset
+
+
strict_re = re.compile(r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) (\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
wkday_re = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
loose_http_re = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    strict = strict_re.search(text)
    if strict:
        fields = strict.groups()
        mon = months_lower.index(fields[1].lower()) + 1
        tt = (int(fields[2]), mon, int(fields[0]),
              int(fields[3]), int(fields[4]), float(fields[5]))
        return my_timegm(tt)

    # No, we need some messy parsing...

    # clean up: leading whitespace, then the (useless) weekday
    text = wkday_re.sub("", text.lstrip(), 1)

    # loose regexp parse
    loose = loose_http_re.search(text)
    if loose is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = loose.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
+
+
iso_re = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # clean up, then loose regexp parse
    match = iso_re.search(text.lstrip())
    if match is None:
        return None  # bad format

    # tz is time zone specifier string
    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    yr, mon, day, hr, min, sec, tz, _ = match.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
+
+
+
+# XXX Andrew Dalke kindly sent me a similar class in response to my request on
+# comp.lang.python, which I then proceeded to lose. I wrote this class
+# instead, but I think he's released his code publicly since, could pinch the
+# tests from it, at least...
class seek_wrapper:
    """Adds a seek method to a file object.

    This is only designed for seeking on readonly file-like objects.

    Wrapped file-like object must have a read method.  The readline method is
    only supported if that method is present on the wrapped object.  The
    readlines method is always supported.  xreadlines and iteration are
    supported only for Python 2.2 and above.

    Public attribute: wrapped (the wrapped file object).

    WARNING: All other attributes of the wrapped object (ie. those that are not
    one of wrapped, read, readline, readlines, xreadlines, __iter__ and next)
    are passed through unaltered, which may or may not make sense for your
    particular file object.

    """
    # General strategy is to check that cache is full enough, then delegate
    # everything to the cache (self._cache, which is a StringIO.StringIO
    # instance.  Seems to be some cStringIO.StringIO problem on 1.5.2 -- I
    # get a StringOobject, with no readlines method.

    # Invariant: the end of the cache is always at the same place as the
    # end of the wrapped file:
    # self.wrapped.tell() == self.__cache.tell()

    def __init__(self, wrapped):
        self.wrapped = wrapped
        self.__have_readline = hasattr(self.wrapped, "readline")
        self.__cache = StringIO()

    def __getattr__(self, name):
        # only called for attributes not found the normal way: pass them
        # through to the wrapped object while there is one
        wrapped = self.__dict__.get("wrapped")
        if wrapped:
            return getattr(wrapped, name)
        return getattr(self.__class__, name)

    def seek(self, offset, whence=0):
        """Move the read position, pulling data from the wrapped file as needed.

        whence is 0 (absolute), 1 (relative to the current position) or
        2 (relative to the end of the *wrapped* file); any other value
        raises ValueError.

        """
        cache = self.__cache
        pos = cache.tell()
        if whence == 2:
            # since we don't know yet where the end of that file is, we must
            # read everything
            cache.seek(0, 2)
            cache.write(self.wrapped.read())
            dest = len(cache.getvalue()) + offset
        else:
            if whence == 0:  # absolute
                dest = offset
            elif whence == 1:  # relative to current position
                dest = pos + offset
            else:
                raise ValueError("invalid whence: %r" % (whence,))
            # make sure we have read all data up to the point we are
            # seeking to; new data must always be appended at the END of
            # the cache, never at the current position, or it would
            # overwrite bytes that are already cached
            missing = dest - len(cache.getvalue())
            if missing > 0:
                cache.seek(0, 2)
                cache.write(self.wrapped.read(missing))
        if dest < 0:
            dest = 0
        return cache.seek(dest)

    def tell(self):
        return self.__cache.tell()

    def read(self, size=-1):
        pos = self.__cache.tell()

        self.__cache.seek(pos)

        end = len(self.__cache.getvalue())
        available = end - pos

        # enough data already cached?
        if size <= available and size != -1:
            return self.__cache.read(size)

        # no, so read sufficient data from wrapped file and cache it
        to_read = size - available
        assert to_read > 0 or size == -1
        self.__cache.seek(0, 2)
        if size == -1:
            self.__cache.write(self.wrapped.read())
        else:
            self.__cache.write(self.wrapped.read(to_read))
        self.__cache.seek(pos)

        return self.__cache.read(size)

    def readline(self, size=-1):
        if not self.__have_readline:
            raise NotImplementedError("no readline method on wrapped object")

        # line we're about to read might not be complete in the cache, so
        # read another line first
        pos = self.__cache.tell()
        self.__cache.seek(0, 2)
        self.__cache.write(self.wrapped.readline())
        self.__cache.seek(pos)

        data = self.__cache.readline()
        if size is None or size < 0:
            return data
        r = data[:size]
        # advance by what was actually returned: the line may be shorter
        # than size, and the rest of the cache must stay readable
        self.__cache.seek(pos + len(r))
        return r

    def readlines(self, sizehint=-1):
        pos = self.__cache.tell()
        self.__cache.seek(0, 2)
        self.__cache.write(self.wrapped.read())
        self.__cache.seek(pos)
        try:
            return self.__cache.readlines(sizehint)
        except TypeError:  # 1.5.2 hack
            return self.__cache.readlines()

    def __iter__(self): return self
    def next(self):
        line = self.readline()
        if line == "": raise StopIteration
        return line

    xreadlines = __iter__

    def __repr__(self):
        return ("<%s at %s whose wrapped object = %s>" %
                (self.__class__.__name__, repr(id(self)), repr(self.wrapped)))

    def close(self):
        # disable the reading interface on this instance, then close and
        # drop the wrapped object
        self.__cache = None
        self.read = None
        self.readline = None
        self.readlines = None
        self.seek = None
        if self.wrapped: self.wrapped.close()
        self.wrapped = None
+
class eoffile:
    """File-like object that always claims to be at end-of-file.

    info() and geturl() return the instance's headers and url attributes;
    the class itself does not set them.

    """

    def read(self, size=-1):
        return ""

    def readline(self, size=-1):
        return ""

    # ...and also supports these response methods
    def info(self):
        return self.headers

    def geturl(self):
        return self.url
+
+
class response_seek_wrapper(seek_wrapper):
    """Avoids unnecessarily clobbering urllib.addinfourl methods on .close().

    After .close(), the following methods are supported:

    .read()
    .readline()
    .readlines()
    .seek()
    .tell()
    .info()
    .geturl()
    .__iter__()
    .next()

    Also supports pickling.

    """

    def close(self):
        """Close the wrapped response but keep the wrapper usable.

        The wrapped object is replaced by an eoffile placeholder that
        carries the response's headers and url.  Calling close() again is
        a no-op.

        """
        if isinstance(self.wrapped, eoffile):
            # already closed
            return
        self.headers = self.wrapped.headers
        self.url = self.wrapped.url
        self.wrapped.close()
        # .info() and .geturl() are looked up on the wrapped object, so the
        # placeholder itself must carry the headers and URL
        placeholder = eoffile()
        placeholder.headers = self.headers
        placeholder.url = self.url
        self.wrapped = placeholder

    def __getstate__(self):
        # There are three obvious options here:
        # 1. truncate
        # 2. read to end
        # 3. close socket, pickle state including read position, then open
        #    again on unpickle and use Range header

        # 2 breaks pickle protocol, because one expects the original object
        # to be left unscathed by pickling.  3 is too complicated and
        # surprising (and too much work ;-) to happen in a sane __getstate__.
        # So we do 1.

        state = self.__dict__.copy()
        placeholder = eoffile()
        if isinstance(self.wrapped, eoffile):
            # already closed: keep whatever the current placeholder carries,
            # so .info()/.geturl() still work after unpickling
            placeholder.__dict__.update(self.wrapped.__dict__)
        state["wrapped"] = placeholder
        return state
Added: Zope3/trunk/src/ClientCookie/__init__.py
===================================================================
--- Zope3/trunk/src/ClientCookie/__init__.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/ClientCookie/__init__.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,69 @@
+import sys
+
+try: True
+except NameError:
+ True = 1
+ False = 0
+
+# If you hate the idea of turning bugs into warnings, do:
+# import ClientCookie; ClientCookie.USE_BARE_EXCEPT = False
+USE_BARE_EXCEPT = True
+WARNINGS_STREAM = sys.stdout
+
+# Import names so that they can be imported directly from the package, like
+# this:
+#from ClientCookie import <whatever>
+
+# These work like equivalents from logging. Use logging direct if you
+# have 2.3.
+from _Debug import getLogger, StreamHandler, NOTSET, INFO, DEBUG
+
+from _ClientCookie import VERSION, __doc__, \
+ Cookie, \
+ CookiePolicy, DefaultCookiePolicy, \
+ CookieJar, FileCookieJar, LoadError, request_host
+from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
+from _MozillaCookieJar import MozillaCookieJar
+from _MSIECookieJar import MSIECookieJar
+try:
+ import bsddb
+except ImportError:
+ pass
+else:
+ from _BSDDBCookieJar import BSDDBCookieJar, CreateBSDDBCookieJar
+#from _MSIEDBCookieJar import MSIEDBCookieJar
+from _ConnCache import ConnectionCache
+try:
+ from urllib2 import AbstractHTTPHandler
+except ImportError:
+ pass
+else:
+ from ClientCookie._urllib2_support import \
+ Request, \
+ OpenerDirector, build_opener, install_opener, urlopen, \
+ OpenerFactory, urlretrieve, BaseHandler, \
+ XHTMLCompatibleHeadParser, HeadParser
+ from ClientCookie._urllib2_support import \
+ HTTPHandler, HTTPRedirectHandler, \
+ HTTPRequestUpgradeProcessor, \
+ HTTPEquivProcessor, SeekableProcessor, HTTPCookieProcessor, \
+ HTTPRefererProcessor, \
+ HTTPRefreshProcessor, HTTPErrorProcessor, \
+ HTTPResponseDebugProcessor, HTTPRedirectDebugProcessor
+
+ try:
+ import robotparser
+ except ImportError:
+ pass
+ else:
+ from ClientCookie._urllib2_support import \
+ HTTPRobotRulesProcessor, RobotExclusionError
+ del robotparser
+
+ import httplib
+ if hasattr(httplib, 'HTTPS'):
+ from ClientCookie._urllib2_support import HTTPSHandler
+ del AbstractHTTPHandler, httplib
+from _Util import http2time
+str2time = http2time
+del http2time
Added: Zope3/trunk/src/ClientCookie/_urllib2_support.py
===================================================================
--- Zope3/trunk/src/ClientCookie/_urllib2_support.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/ClientCookie/_urllib2_support.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,746 @@
+"""Integration with Python standard library module urllib2.
+
+Also includes a redirection bugfix, support for parsing HTML HEAD blocks for
+the META HTTP-EQUIV tag contents, and following Refresh header redirects.
+
+Copyright 2002-2004 John J Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD License (see the file COPYING included with the
+distribution).
+
+"""
+
+import copy, time, tempfile
+
+import ClientCookie
+from _ClientCookie import CookieJar, request_host
+from _Util import isstringlike, startswith, getheaders
+from _HeadersUtil import is_html
+from _Debug import getLogger
+
+try: True
+except NameError:
+ True = 1
+ False = 0
+
+
+CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes
+
+try:
+ from urllib2 import AbstractHTTPHandler
+except ImportError:
+ pass
+else:
+ import urlparse, urllib2, urllib, httplib
+ import htmllib, sgmllib, formatter
+ from urllib2 import URLError, HTTPError
+ import types, string, socket
+ from cStringIO import StringIO
+ try:
+ import threading
+ _threading = threading; del threading
+ except ImportError:
+ import dummy_threading
+ _threading = dummy_threading; del dummy_threading
+
+ from _Util import response_seek_wrapper
+ from _Request import Request
+
+
+ class BaseHandler(urllib2.BaseHandler):
+     """urllib2.BaseHandler with a default handler_order and ordering."""
+     handler_order = 500
+
+     def __cmp__(self, other):
+         """Compare by handler_order; objects without one compare equal."""
+         if hasattr(other, "handler_order"):
+             return cmp(self.handler_order, other.handler_order)
+         # Try to preserve the old behavior of having custom classes
+         # inserted after default ones (works only for custom user
+         # classes which are not aware of handler_order).
+         return 0
+
+
+ # This fixes a bug in urllib2 as of Python 2.1.3 and 2.2.2
+ # (http://www.python.org/sf/549151)
+ # 2.2.3 is broken here (my fault!), 2.3 is fixed.
+ class HTTPRedirectHandler(BaseHandler):
+     """Follow HTTP redirects (and "refresh" pseudo-errors), with loop limits."""
+     # maximum number of redirections to any single URL
+     # this is needed because of the state that cookies introduce
+     max_repeats = 4
+     # maximum total number of redirections (regardless of URL) before
+     # assuming we're in a loop
+     max_redirections = 10
+
+     # Implementation notes:
+
+     # To avoid the server sending us into an infinite loop, the request
+     # object needs to track what URLs we have already seen.  Do this by
+     # adding a handler-specific attribute to the Request object.  The value
+     # of the dict is used to count the number of times the same URL has
+     # been visited.  This is needed because visiting the same URL twice
+     # does not necessarily imply a loop, thanks to state introduced by
+     # cookies.
+
+     # Always unhandled redirection codes:
+     # 300 Multiple Choices: should not handle this here.
+     # 304 Not Modified: no need to handle here: only of interest to caches
+     #     that do conditional GETs
+     # 305 Use Proxy: probably not worth dealing with here
+     # 306 Unused: what was this for in the previous versions of protocol??
+
+     def redirect_request(self, newurl, req, fp, code, msg, headers):
+         """Return a Request or None in response to a redirect.
+
+         This is called by the http_error_30x methods when a redirection
+         response is received.  If a redirection should take place, return a
+         new Request to allow http_error_30x to perform the redirect;
+         otherwise, return None to indicate that an HTTPError should be
+         raised.
+
+         Note that this implementation never returns None itself: for a
+         code it will not follow it raises HTTPError directly.
+
+         """
+         if code in (301, 302, 303, "refresh") or \
+                (code == 307 and not req.has_data()):
+             # Strictly (according to RFC 2616), 301 or 302 in response to
+             # a POST MUST NOT cause a redirection without confirmation
+             # from the user (of urllib2, in this case).  In practice,
+             # essentially all clients do redirect in this case, so we do
+             # the same.
+             return Request(newurl,
+                            headers=req.headers,
+                            origin_req_host=req.get_origin_req_host(),
+                            unverifiable=True)
+         else:
+             raise HTTPError(req.get_full_url(), code, msg, headers, fp)
+
+     def http_error_302(self, req, fp, code, msg, headers):
+         """Open the redirect target named by the Location (or URI) header.
+
+         Returns None when neither header is present or when
+         redirect_request() returns None.  Raises HTTPError, with inf_msg
+         prepended to msg, when the per-URL or total limit is reached.
+         """
+         # Some servers (incorrectly) return multiple Location headers
+         # (so probably same goes for URI).  Use first header.
+         if headers.has_key('location'):
+             newurl = getheaders(headers, 'location')[0]
+         elif headers.has_key('uri'):
+             newurl = getheaders(headers, 'uri')[0]
+         else:
+             return
+         # resolve a relative redirect target against the current URL
+         newurl = urlparse.urljoin(req.get_full_url(), newurl)
+
+         # XXX Probably want to forget about the state of the current
+         # request, although that might interact poorly with other
+         # handlers that also use handler-specific request attributes
+         new = self.redirect_request(newurl, req, fp, code, msg, headers)
+         if new is None:
+             return
+
+         # loop detection
+         # .redirect_dict has a key url if url was previously visited.
+         if hasattr(req, 'redirect_dict'):
+             # the same dict object is shared by the old and new request
+             visited = new.redirect_dict = req.redirect_dict
+             if (visited.get(newurl, 0) >= self.max_repeats or
+                 len(visited) >= self.max_redirections):
+                 raise HTTPError(req.get_full_url(), code,
+                                 self.inf_msg + msg, headers, fp)
+         else:
+             visited = new.redirect_dict = req.redirect_dict = {}
+         visited[newurl] = visited.get(newurl, 0) + 1
+
+         # Don't close the fp until we are sure that we won't use it
+         # with HTTPError.
+         fp.read()
+         fp.close()
+
+         return self.parent.open(new)
+
+     # the other handled codes, and "refresh", share the 302 implementation
+     http_error_301 = http_error_303 = http_error_307 = http_error_302
+     http_error_refresh = http_error_302
+
+     inf_msg = "The HTTP server returned a redirect error that would " \
+               "lead to an infinite loop.\n" \
+               "The last 30x error message was:\n"
+
+
+ class HTTPRequestUpgradeProcessor(BaseHandler):
+     """Upgrade urllib2.Request objects to this module's Request."""
+     # yuck!
+     handler_order = 0  # before anything else
+
+     def http_request(self, request):
+         """Return request, rebuilt if it lacks add_unredirected_header."""
+         if hasattr(request, "add_unredirected_header"):
+             return request
+         upgraded = Request(request._Request__original, request.data,
+                            request.headers)
+         # carry the optional attributes across when the old request has them
+         for attr in ("origin_req_host", "unverifiable"):
+             try:
+                 setattr(upgraded, attr, getattr(request, attr))
+             except AttributeError:
+                 pass
+         return upgraded
+
+     https_request = http_request
+
+
+ # XXX would self.reset() work, instead of raising this exception?
+ class EndOfHeadError(Exception):
+     """Raised by the head parsers to stop parsing at the end of HEAD."""
+ class AbstractHeadParser:
+     """Mixin that collects META HTTP-EQUIV pairs in self.http_equiv."""
+     # only these elements are allowed in or before HEAD of document
+     head_elems = ("html", "head",
+                   "title", "base",
+                   "script", "style", "meta", "link", "object")
+
+     def __init__(self):
+         self.http_equiv = []
+
+     def start_meta(self, attrs):
+         """Record (http-equiv, content) for a META tag carrying http-equiv."""
+         equiv = None
+         content = None
+         for name, val in attrs:
+             if name == "http-equiv":
+                 equiv = val
+             elif name == "content":
+                 content = val
+         if equiv is None:
+             return
+         self.http_equiv.append((equiv, content))
+
+     def end_head(self):
+         raise EndOfHeadError()
+
+ try:
+     import HTMLParser
+ except ImportError:
+     # without the HTMLParser module, XHTMLCompatibleHeadParser is not defined
+     pass
+ else:
+     class XHTMLCompatibleHeadParser(AbstractHeadParser,
+                                     HTMLParser.HTMLParser):
+         """Head parser built on HTMLParser.HTMLParser.
+
+         Start and end tags are dispatched to start_<tag> / do_<tag> /
+         end_<tag> methods when they exist; any tag not listed in
+         head_elems raises EndOfHeadError.
+         """
+         def __init__(self):
+             HTMLParser.HTMLParser.__init__(self)
+             AbstractHeadParser.__init__(self)
+
+         def handle_starttag(self, tag, attrs):
+             if tag not in self.head_elems:
+                 raise EndOfHeadError()
+             # prefer start_<tag>, fall back to do_<tag>
+             try:
+                 method = getattr(self, 'start_' + tag)
+             except AttributeError:
+                 try:
+                     method = getattr(self, 'do_' + tag)
+                 except AttributeError:
+                     pass # unknown tag
+                 else:
+                     method(attrs)
+             else:
+                 method(attrs)
+
+         def handle_endtag(self, tag):
+             if tag not in self.head_elems:
+                 raise EndOfHeadError()
+             try:
+                 method = getattr(self, 'end_' + tag)
+             except AttributeError:
+                 pass # unknown tag
+             else:
+                 method()
+
+         # handle_charref, handle_entityref and default entitydefs are taken
+         # from sgmllib
+         def handle_charref(self, name):
+             # only decimal references in the range 0..255 are converted;
+             # everything else goes to unknown_charref()
+             try:
+                 n = int(name)
+             except ValueError:
+                 self.unknown_charref(name)
+                 return
+             if not 0 <= n <= 255:
+                 self.unknown_charref(name)
+                 return
+             self.handle_data(chr(n))
+
+         # Definition of entities -- derived classes may override
+         entitydefs = \
+                 {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
+
+         def handle_entityref(self, name):
+             table = self.entitydefs
+             if name in table:
+                 self.handle_data(table[name])
+             else:
+                 self.unknown_entityref(name)
+                 return
+
+         def unknown_entityref(self, ref):
+             # pass the unrecognised reference through as literal text
+             self.handle_data("&%s;" % ref)
+
+         def unknown_charref(self, ref):
+             # pass the unrecognised reference through as literal text
+             self.handle_data("&#%s;" % ref)
+
+ class HeadParser(AbstractHeadParser, htmllib.HTMLParser):
+     """Head parser built on htmllib.HTMLParser with a NullFormatter."""
+
+     def __init__(self):
+         htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
+         AbstractHeadParser.__init__(self)
+
+     def handle_starttag(self, tag, method, attrs):
+         # any element outside head_elems ends the HEAD
+         if tag not in self.head_elems:
+             raise EndOfHeadError()
+         method(attrs)
+
+     def handle_endtag(self, tag, method):
+         if tag not in self.head_elems:
+             raise EndOfHeadError()
+         method()
+
+ def parse_head(fileobj, parser):
+     """Return a list of key, value pairs.
+
+     Data is read from fileobj in CHUNK-sized pieces and fed to parser
+     until the parser raises EndOfHeadError or a short read occurs; the
+     result is parser.http_equiv.
+     """
+     more = True
+     while more:
+         chunk = fileobj.read(CHUNK)
+         try:
+             parser.feed(chunk)
+         except EndOfHeadError:
+             break
+         # a short read should only happen if there is no HTML body, or if
+         # CHUNK is big
+         more = (len(chunk) == CHUNK)
+     return parser.http_equiv
+
+ class HTTPEquivProcessor(BaseHandler):
+     """Append META HTTP-EQUIV headers to regular HTTP headers."""
+
+     def __init__(self, head_parser_class=HeadParser):
+         self.head_parser_class = head_parser_class
+
+     def _parse_errors(self):
+         """Return the tuple of parser exceptions that are ignored.
+
+         The HTMLParser module is imported optionally by this module, so
+         its error class is only included when that import succeeded.
+         """
+         errors = [sgmllib.SGMLParseError]
+         try:
+             errors.append(HTMLParser.HTMLParseError)
+         except NameError:
+             pass
+         return tuple(errors)
+
+     def http_response(self, request, response):
+         """Merge http-equiv pairs from an HTML response into its headers.
+
+         The response is wrapped to make it seekable if necessary and is
+         rewound to the start after the HEAD has been parsed.  Parse
+         errors leave the headers untouched.
+         """
+         if not hasattr(response, "seek"):
+             response = response_seek_wrapper(response)
+         headers = response.info()
+         url = response.geturl()
+         ct_hdrs = getheaders(response.info(), "content-type")
+         if is_html(ct_hdrs, url):
+             try:
+                 try:
+                     html_headers = parse_head(response, self.head_parser_class())
+                 finally:
+                     response.seek(0)
+             except self._parse_errors():
+                 pass
+             else:
+                 for hdr, val in html_headers:
+                     # rfc822.Message interprets this as appending, not clobbering
+                     headers[hdr] = val
+         return response
+
+     https_response = http_response
+
+ # XXX ATM this only takes notice of http responses -- probably
+ # should be independent of protocol scheme (http, ftp, etc.)
+ class SeekableProcessor(BaseHandler):
+     """Make responses seekable."""
+
+     def http_response(self, request, response):
+         """Return response, wrapped if it has no seek attribute."""
+         if hasattr(response, "seek"):
+             return response
+         return response_seek_wrapper(response)
+
+     https_response = http_response
+
+ class HTTPCookieProcessor(BaseHandler):
+     """Handle HTTP cookies.
+
+     Public attributes:
+
+     cookiejar: CookieJar instance
+
+     """
+     def __init__(self, cookiejar=None):
+         # a fresh CookieJar is created when none is supplied
+         if cookiejar is not None:
+             self.cookiejar = cookiejar
+         else:
+             self.cookiejar = CookieJar()
+
+     def http_request(self, request):
+         """Let the jar add its cookie header to the outgoing request."""
+         jar = self.cookiejar
+         jar.add_cookie_header(request)
+         return request
+
+     def http_response(self, request, response):
+         """Let the jar extract cookies from the response."""
+         jar = self.cookiejar
+         jar.extract_cookies(response, request)
+         return response
+
+     https_request = http_request
+     https_response = http_response
+
+ try:
+     import robotparser
+ except ImportError:
+     pass
+ else:
+     class RobotExclusionError(urllib2.HTTPError):
+         """HTTPError raised for a request that robots.txt disallows.
+
+         The refused request is stored in the request attribute; the
+         remaining arguments are passed on to urllib2.HTTPError.
+         """
+         def __init__(self, request, *args):
+             apply(urllib2.HTTPError.__init__, (self,)+args)
+             self.request = request
+
+     class HTTPRobotRulesProcessor(BaseHandler):
+         """Refuse requests that the host's robots.txt does not allow."""
+         # before redirections and response debugging, after everything else
+         handler_order = 800
+
+         # Only a failed import should select the mimetools fallback; a
+         # bare except here would also hide unrelated errors.
+         try:
+             from httplib import HTTPMessage
+         except ImportError:
+             from mimetools import Message
+             http_response_class = Message
+         else:
+             http_response_class = HTTPMessage
+
+         def __init__(self, rfp_class=robotparser.RobotFileParser):
+             self.rfp_class = rfp_class
+             self.rfp = None
+             self._host = None
+
+         def http_request(self, request):
+             """Return request if allowed, else raise RobotExclusionError.
+
+             robots.txt is fetched again whenever the request's host
+             differs from the host of the previous request.
+             """
+             host = request.get_host()
+             scheme = request.get_type()
+             if host != self._host:
+                 self.rfp = self.rfp_class()
+                 self.rfp.set_url(scheme+"://"+host+"/robots.txt")
+                 self.rfp.read()
+                 self._host = host
+
+             ua = request.get_header("User-agent", "")
+             if self.rfp.can_fetch(ua, request.get_full_url()):
+                 return request
+             else:
+                 msg = "request disallowed by robots.txt"
+                 raise RobotExclusionError(
+                     request,
+                     request.get_full_url(),
+                     403, msg,
+                     self.http_response_class(StringIO()), StringIO(msg))
+
+         https_request = http_request
+
+ class HTTPRefererProcessor(BaseHandler):
+     """Add Referer header to requests.
+
+     This only makes sense if you use each RefererProcessor for a single
+     chain of requests only (so, for example, if you use a single
+     HTTPRefererProcessor to fetch a series of URLs extracted from a single
+     page, this will break).
+
+     """
+     def __init__(self):
+         self.referer = None
+
+     def http_request(self, request):
+         """Add the remembered URL as Referer unless one is already set."""
+         if self.referer is None or request.has_header("Referer"):
+             return request
+         request.add_unredirected_header("Referer", self.referer)
+         return request
+
+     def http_response(self, request, response):
+         """Remember the response URL for the next request."""
+         self.referer = response.geturl()
+         return response
+
+     https_request = http_request
+     https_response = http_response
+
+ class HTTPResponseDebugProcessor(BaseHandler):
+     """Log each response body to the ClientCookie.http_responses logger."""
+     handler_order = 900  # before redirections, after everything else
+
+     def http_response(self, request, response):
+         seekable = hasattr(response, "seek")
+         if not seekable:
+             response = response_seek_wrapper(response)
+         log = getLogger("ClientCookie.http_responses").info
+         try:
+             body = response.read()
+             log(body)
+         finally:
+             # rewind so later consumers can still read the body
+             response.seek(0)
+         log("*****************************************************")
+         return response
+
+     https_response = http_response
+
+ class HTTPRedirectDebugProcessor(BaseHandler):
+     """Log requests that carry a redirect_dict attribute."""
+
+     def http_request(self, request):
+         if not hasattr(request, "redirect_dict"):
+             return request
+         log = getLogger("ClientCookie.http_redirects").info
+         log("redirecting to %s", request.get_full_url())
+         return request
+
+ class HTTPRefreshProcessor(BaseHandler):
+     """Perform HTTP Refresh redirections.
+
+     Note that if a non-200 HTTP code has occurred (for example, a 30x
+     redirect), this processor will do nothing.
+
+     By default, only zero-time Refresh headers are redirected.  Use the
+     max_time attribute / constructor argument to allow Refresh with longer
+     pauses.  Use the honor_time attribute / constructor argument to control
+     whether the requested pause is honoured (with a time.sleep()) or
+     skipped in favour of immediate redirection.
+
+     A Refresh header whose pause is not a number is ignored.
+
+     Public attributes:
+
+     max_time: see above
+     honor_time: see above
+
+     """
+     handler_order = 1000
+
+     def __init__(self, max_time=0, honor_time=True):
+         self.max_time = max_time
+         self.honor_time = honor_time
+
+     def http_response(self, request, response):
+         """Turn a usable Refresh header into a "refresh" pseudo-error.
+
+         The header is expected to look like "<pause>; <key>=<url>".  The
+         response is returned unchanged when the code is not 200, when the
+         header is missing or malformed, or when the pause exceeds
+         max_time.
+         """
+         code, msg, hdrs = response.code, response.msg, response.info()
+
+         if code != 200 or not hdrs.has_key("refresh"):
+             return response
+         refresh = getheaders(hdrs, "refresh")[0]
+         i = string.find(refresh, ";")
+         if i == -1:
+             return response
+         pause, newurl_spec = refresh[:i], refresh[i+1:]
+         i = string.find(newurl_spec, "=")
+         if i == -1:
+             return response
+         try:
+             # float() rather than int() so that fractional pauses such
+             # as "0.5" are accepted
+             pause = float(pause)
+         except ValueError:
+             # malformed pause: ignore the header instead of letting the
+             # ValueError abort response processing
+             return response
+         if (self.max_time is None) or (pause <= self.max_time):
+             if pause != 0 and self.honor_time:
+                 time.sleep(pause)
+             newurl = newurl_spec[i+1:]
+             hdrs["location"] = newurl
+             # hardcoded http is NOT a bug
+             response = self.parent.error(
+                 "http", request, response,
+                 "refresh", msg, hdrs)
+
+         return response
+
+     https_response = http_response
+
+ class HTTPErrorProcessor(BaseHandler):
+     """Process HTTP error responses.
+
+     The purpose of this handler is to allow other response processors a
+     look-in by removing the call to parent.error() from
+     AbstractHTTPHandler.
+
+     For non-200 error codes, this just passes the job on to the
+     Handler.<proto>_error_<code> methods, via the OpenerDirector.error
+     method.  Eventually, urllib2.HTTPDefaultErrorHandler will raise an
+     HTTPError if no other handler handles the error.
+
+     """
+     handler_order = 1000  # after all other processors
+
+     def http_response(self, request, response):
+         code, msg, hdrs = response.code, response.msg, response.info()
+
+         # every code other than 200 is handed to the error machinery
+         if code != 200:
+             # hardcoded http is NOT a bug
+             response = self.parent.error(
+                 "http", request, response, code, msg, hdrs)
+
+         return response
+
+     https_response = http_response
+
+
+ class AbstractHTTPHandler(BaseHandler):
+     """Shared request preparation and opening logic for HTTP(S) handlers."""
+
+     def __init__(self, debuglevel=0):
+         self._debuglevel = debuglevel
+
+     def set_http_debuglevel(self, level):
+         self._debuglevel = level
+
+     def do_request_(self, request):
+         """Add default headers to request and return it.
+
+         Adds Content-type (when the request has data), Host, and the
+         opener's addheaders, each only if not already present.  Raises
+         URLError when the request has no host.
+         """
+         host = request.get_host()
+         if not host:
+             raise URLError('no host given')
+
+         if request.has_data():  # POST
+             # NOTE(review): data is not used further in this method
+             data = request.get_data()
+             if not request.has_header('Content-type'):
+                 request.add_unredirected_header(
+                     'Content-type',
+                     'application/x-www-form-urlencoded')
+
+         scheme, sel = urllib.splittype(request.get_selector())
+         sel_host, sel_path = urllib.splithost(sel)
+         if not request.has_header('Host'):
+             # prefer a host found in the selector over the request's host
+             request.add_unredirected_header('Host', sel_host or host)
+         for name, value in self.parent.addheaders:
+             name = string.capitalize(name)
+             if not request.has_header(name):
+                 request.add_unredirected_header(name, value)
+
+         return request
+
+     def do_open(self, http_class, req):
+         """Return an addinfourl object for the request, using http_class.
+
+         http_class must implement the HTTPConnection API from httplib.
+         The addinfourl return value is a file-like object.  It also
+         has methods and attributes including:
+             - info(): return a mimetools.Message object for the headers
+             - geturl(): return the original request URL
+             - code: HTTP status code
+
+         Raises URLError when the request has no host or when a
+         socket.error occurs while sending the request or getting the
+         response.
+         """
+         host = req.get_host()
+         if not host:
+             raise URLError('no host given')
+
+         h = http_class(host) # will parse host:port
+         h.set_debuglevel(self._debuglevel)
+
+         # unredirected headers are layered on top of the regular ones
+         headers = req.headers.copy()
+         headers.update(req.unredirected_hdrs)
+         # We want to make an HTTP/1.1 request, but the addinfourl
+         # class isn't prepared to deal with a persistent connection.
+         # It will try to read all remaining data from the socket,
+         # which will block while the server waits for the next request.
+         # So make sure the connection gets closed after the (only)
+         # request.
+         headers["Connection"] = "close"
+         try:
+             h.request(req.get_method(), req.get_selector(), req.data, headers)
+             r = h.getresponse()
+         except socket.error, err: # XXX what error?
+             raise URLError(err)
+
+         # Pick apart the HTTPResponse object to get the addinfourl
+         # object initialized properly.
+
+         # Wrap the HTTPResponse object in socket's file object adapter
+         # for Windows.  That adapter calls recv(), so delegate recv()
+         # to read().  This weird wrapping allows the returned object to
+         # have readline() and readlines() methods.
+
+         # XXX It might be better to extract the read buffering code
+         # out of socket._fileobject() and into a base class.
+
+         r.recv = r.read
+         fp = socket._fileobject(r, 'rb', -1)
+
+         resp = urllib.addinfourl(fp, r.msg, req.get_full_url())
+         resp.code = r.status
+         resp.msg = r.reason
+         return resp
+
+
+ class HTTPHandler(AbstractHTTPHandler):
+     """Handler for http: URLs, opening them with httplib.HTTPConnection."""
+     http_request = AbstractHTTPHandler.do_request_
+
+     def http_open(self, req):
+         connection_class = httplib.HTTPConnection
+         return self.do_open(connection_class, req)
+
+ # HTTPSHandler is only defined when httplib has an HTTPS attribute
+ if hasattr(httplib, 'HTTPS'):
+     class HTTPSHandler(AbstractHTTPHandler):
+         """Handler for https: URLs, opening them with httplib.HTTPSConnection."""
+         def https_open(self, req):
+             return self.do_open(httplib.HTTPSConnection, req)
+
+         https_request = AbstractHTTPHandler.do_request_
+
+## class HTTPHandler(AbstractHTTPHandler):
+## def http_open(self, req):
+## return self.do_open(httplib.HTTP, req)
+
+## http_request = AbstractHTTPHandler.do_request_
+
+## if hasattr(httplib, 'HTTPS'):
+## class HTTPSHandler(AbstractHTTPHandler):
+## def https_open(self, req):
+## return self.do_open(httplib.HTTPS, req)
+
+## https_request = AbstractHTTPHandler.do_request_
+
+ # Choose the OpenerDirector implementation from the urllib2 version: the
+ # first three characters of urllib2.__version__ (e.g. "2.4") are turned
+ # into an integer (24) for the comparison.
+ if int(10*float(urllib2.__version__[:3])) >= 24:
+     # urllib2 supports processors already
+     from _Opener import OpenerMixin
+     class OpenerDirector(urllib2.OpenerDirector, OpenerMixin):
+         pass
+ else:
+     from _Opener import OpenerDirector
+
+ class OpenerFactory:
+     """This class's interface is quite likely to change."""
+
+     default_classes = [
+         # handlers
+         urllib2.ProxyHandler,
+         urllib2.UnknownHandler,
+         HTTPHandler,  # from this module (derived from new AbstractHTTPHandler)
+         urllib2.HTTPDefaultErrorHandler,
+         HTTPRedirectHandler,  # from this module (bugfixed)
+         urllib2.FTPHandler,
+         urllib2.FileHandler,
+         # processors
+         HTTPRequestUpgradeProcessor,
+         #HTTPEquivProcessor,
+         #SeekableProcessor,
+         HTTPCookieProcessor,
+         #HTTPRefererProcessor,
+         #HTTPRefreshProcessor,
+         HTTPErrorProcessor
+         ]
+     handlers = []
+     replacement_handlers = []
+
+     def __init__(self, klass=OpenerDirector):
+         # klass is the opener class instantiated by build_opener()
+         self.klass = klass
+
+     def build_opener(self, *handlers):
+         """Create an opener object from a list of handlers and processors.
+
+         The opener will use several default handlers and processors, including
+         support for HTTP and FTP.
+
+         If any of the handlers passed as arguments are subclasses of the
+         default handlers, the default handlers will not be used.
+
+         Handlers may be passed as classes (which are instantiated without
+         arguments) or as instances.
+
+         """
+         opener = self.klass()
+         default_classes = list(self.default_classes)
+         if hasattr(httplib, 'HTTPS'):
+             default_classes.append(HTTPSHandler)
+         skip = []
+         for klass in default_classes:
+             for check in handlers:
+                 if type(check) == types.ClassType:
+                     matches = issubclass(check, klass)
+                 elif type(check) == types.InstanceType:
+                     matches = isinstance(check, klass)
+                 else:
+                     matches = False
+                 # Record each default class once only: several supplied
+                 # handlers may replace the same default, and removing it
+                 # twice below would raise ValueError.
+                 if matches and klass not in skip:
+                     skip.append(klass)
+         for klass in skip:
+             default_classes.remove(klass)
+
+         for klass in default_classes:
+             opener.add_handler(klass())
+         for h in handlers:
+             if type(h) == types.ClassType:
+                 h = h()
+             opener.add_handler(h)
+
+         return opener
+
+ build_opener = OpenerFactory().build_opener
+
+ _opener = None
+ urlopen_lock = _threading.Lock()
+
+ def _get_opener():
+     """Return the module-wide opener, building it on first use."""
+     global _opener
+     if _opener is None:
+         urlopen_lock.acquire()
+         try:
+             # check again now that the lock is held
+             if _opener is None:
+                 _opener = build_opener()
+         finally:
+             urlopen_lock.release()
+     return _opener
+
+ def urlopen(url, data=None):
+     """Open url with the module-wide opener."""
+     return _get_opener().open(url, data)
+
+ def urlretrieve(url, filename=None, reporthook=None, data=None):
+     """Call retrieve() on the module-wide opener."""
+     return _get_opener().retrieve(url, filename, reporthook, data)
+
+ def install_opener(opener):
+     """Make opener the module-wide opener used by urlopen/urlretrieve."""
+     global _opener
+     _opener = opener
Added: Zope3/trunk/src/ClientForm.py
===================================================================
--- Zope3/trunk/src/ClientForm.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/ClientForm.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,3126 @@
+"""HTML form handling for web clients.
+
+ClientForm is a Python module for handling HTML forms on the client
+side, useful for parsing HTML forms, filling them in and returning the
+completed forms to the server. It has developed from a port of Gisle
+Aas' Perl module HTML::Form, from the libwww-perl library, but the
+interface is not the same.
+
+The most useful docstring is the one for HTMLForm.
+
+RFC 1866: HTML 2.0
+RFC 1867: Form-based File Upload in HTML
+RFC 2388: Returning Values from Forms: multipart/form-data
+HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX)
+HTML 4.01 Specification, W3C Recommendation 24 December 1999
+
+
+Copyright 2002-2005 John J. Lee <jjl at pobox.com>
+Copyright 2005 Gary Poster
+Copyright 2005 Zope Corporation
+Copyright 1998-2000 Gisle Aas.
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD License (see the file COPYING included with
+the distribution).
+
+"""
+
+# XXX
+# Add some more functional tests
+# Especially single and multiple file upload on the internet.
+# Does file upload work when name is missing? Sourceforge tracker form
+# doesn't like it. Check standards, and test with Apache. Test
+# binary upload with Apache.
+# There have been reports that some servers are very picky about MIME
+# boundaries, so file uploads may fail with those servers. Should
+# copy what IE does religiously.
+# Unicode: see Wichert Akkerman's 2004-01-22 message to c.l.py.
+# Controls can have name=None (e.g. forms constructed partly with
+# JavaScript), but find_control can't be told to find a control
+# with that name, because None there means 'unspecified'. Can still
+# get at by nr, but would be nice to be able to specify something
+# equivalent to name=None, too.
+# Deal with character sets properly. Not sure what the issues are here.
+# Do URL encodings need any attention?
+# I don't *think* any encoding of control names, filenames or data is
+# necessary -- HTML spec. doesn't require it, and Mozilla Firebird 0.6
+# doesn't seem to do it.
+# Add charset parameter to Content-type headers? How to find value??
+# mailto submission & enctype text/plain
+# I'm not going to fix this unless somebody tells me what real servers
+# that want this encoding actually expect: If enctype is
+# application/x-www-form-urlencoded and there's a FILE control present.
+# Strictly, it should be 'name=data' (see HTML 4.01 spec., section
+# 17.13.2), but I send "name=" ATM. What about multiple file upload??
+# Get rid of MimeWriter.
+# Should really use sgmllib, not htmllib.
+
+# Would be nice, but I'm not going to do it myself:
+# -------------------------------------------------
+# Maybe a 0.3.x?
+# Replace by_label etc. with moniker / selector concept. Allows, eg.,
+# a choice between selection by value / id / label / element
+# contents. Or choice between matching labels exactly or by
+# substring. Etc.
+# Remove deprecated methods.
+# action should probably be an absolute URI, like DOMForm.
+# ...what else?
+# Work on DOMForm.
+# XForms? Don't know if there's a need here.
+
+
+try: True
+except NameError:
+ True = 1
+ False = 0
+
+try: bool
+except NameError:
+ def bool(expr):
+ if expr: return True
+ else: return False
+
+import sys, urllib, urllib2, types, mimetools, copy, urlparse, \
+ htmlentitydefs, re
+from urlparse import urljoin
+from cStringIO import StringIO
+
+try:
+    import warnings
+except ImportError:
+    # no warnings module available: deprecation notices are dropped
+    def deprecation(message):
+        pass
+else:
+    def deprecation(message):
+        # stacklevel=2 attributes the warning to the caller of deprecation()
+        warnings.warn(message, DeprecationWarning, stacklevel=2)
+
+VERSION = "0.2.1a"
+
+CHUNK = 1024 # size of chunks fed to parser, in bytes
+
+_compress_re = re.compile(r"\s+")
+def compress_text(text):
+    """Strip text and collapse each run of whitespace to a single space."""
+    stripped = text.strip()
+    return _compress_re.sub(" ", stripped)
+
+# This version of urlencode is from my Python 1.5.2 back-port of the
+# Python 2.1 CVS maintenance branch of urllib. It will accept a sequence
+# of pairs instead of a mapping -- the 2.0 version only accepts a mapping.
+def urlencode(query,doseq=False,):
+    """Encode a sequence of two-element tuples or dictionary into a URL query \
+string.
+
+    If any values in the query arg are sequences and doseq is true, each
+    sequence element is converted to a separate parameter.
+
+    If the query arg is a sequence of two-element tuples, the order of the
+    parameters in the output will match the order of parameters in the
+    input.
+
+    Raises TypeError if query is neither a mapping nor a sequence whose
+    first element is a tuple (for example, a non-empty string).
+    """
+
+    if hasattr(query,"items"):
+        # mapping objects
+        query = query.items()
+    else:
+        # it's a bother at times that strings and string-like objects are
+        # sequences...
+        try:
+            # non-sequence items should not work with len()
+            # non-empty strings will fail this
+            if len(query) and type(query[0]) != types.TupleType:
+                raise TypeError()
+            # zero-length sequences of all types will get here and succeed,
+            # but that's a minor nit - since the original implementation
+            # allowed empty dicts that type of behavior probably should be
+            # preserved for consistency
+        except TypeError:
+            ty,va,tb = sys.exc_info()
+            # Re-raise with the original traceback.  The traceback must be
+            # the third operand of raise, not an argument of TypeError,
+            # or it ends up inside the exception's args.
+            raise TypeError, ("not a valid non-string sequence or mapping "
+                              "object"), tb
+
+    l = []
+    if not doseq:
+        # preserve old behavior
+        for k, v in query:
+            k = urllib.quote_plus(str(k))
+            v = urllib.quote_plus(str(v))
+            l.append(k + '=' + v)
+    else:
+        for k, v in query:
+            k = urllib.quote_plus(str(k))
+            if type(v) == types.StringType:
+                v = urllib.quote_plus(v)
+                l.append(k + '=' + v)
+            elif type(v) == types.UnicodeType:
+                # is there a reasonable way to convert to ASCII?
+                # encode generates a string, but "replace" or "ignore"
+                # lose information and "strict" can raise UnicodeError
+                v = urllib.quote_plus(v.encode("ASCII","replace"))
+                l.append(k + '=' + v)
+            else:
+                try:
+                    # is this a sufficient test for sequence-ness?
+                    len(v)
+                except TypeError:
+                    # not a sequence
+                    v = urllib.quote_plus(str(v))
+                    l.append(k + '=' + v)
+                else:
+                    # loop over the sequence
+                    for elt in v:
+                        l.append(k + '=' + urllib.quote_plus(str(elt)))
+    return '&'.join(l)
+
+def unescape(data, entities):
+    """Replace entity references in data using the entities mapping.
+
+    Each match of "&...;" is looked up in entities by its full text,
+    including the "&" and ";"; references not in the mapping are left
+    unchanged.  data is returned as-is when it is None or contains no "&".
+    """
+    if data is None or '&' not in data:
+        return data
+    def lookup(match, table=entities):
+        ref = match.group()
+        return table.get(ref, ref)
+    return re.sub(r'&\S+?;', lookup, data)
+
+def issequence(x):
+    """Return True if x[0] works or fails only with IndexError.
+
+    TypeError or KeyError from x[0] means x is not treated as a sequence.
+    """
+    try:
+        x[0]
+    except IndexError:
+        # indexable, just empty
+        return True
+    except (TypeError, KeyError):
+        return False
+    return True
+
+def isstringlike(x):
+    """Return True if an empty string can be added to x without error."""
+    try:
+        x + ""
+    except:
+        return False
+    return True
+
+
+# XXX don't really want to drag this along (MimeWriter, choose_boundary)
+
+# --------------------------------------------------------------------
+# grabbed from Python standard library mimetools module and tweaked to
+# avoid socket.gaierror and to avoid dots ('.') in MIME boundaries
+try:
+ import thread
+ _thread = thread; del thread
+except ImportError:
+ import dummy_thread
+ _thread = dummy_thread; del dummy_thread
+_counter_lock = _thread.allocate_lock()
+del _thread
+
+_counter = 0
+def _get_next_counter():
+    """Increment the module-level _counter under its lock and return it."""
+    global _counter
+    _counter_lock.acquire()
+    try:
+        _counter = _counter + 1
+        return _counter
+    finally:
+        # always release, even if the increment fails
+        _counter_lock.release()
+
+_prefix = None
+
+def choose_boundary():
+    """Return a string usable as a multipart boundary.
+
+    The string chosen is unique within a single program run, and
+    incorporates the host address (or 'localhost' if it cannot be
+    resolved), the user id (if available), process id (if available),
+    and current time.  So it's very unlikely the returned string appears
+    in message text, but there's no guarantee.
+
+    The boundary contains no dots."""
+
+    global _prefix
+    import time
+    import os
+    import socket
+    if _prefix is None:
+        # socket.gaierror is not present in every Python version; fall
+        # back to socket.error where it is missing
+        try:
+            socket.gaierror
+        except AttributeError:
+            exc = socket.error
+        else:
+            exc = socket.gaierror
+
+        try:
+            hostid = socket.gethostbyname(socket.gethostname())
+        except exc:
+            hostid = 'localhost'
+        # gethostbyname() gives a dotted address; this variant of the
+        # function is meant to produce a dot-free boundary, so drop them
+        hostid = hostid.replace('.', '')
+        try:
+            uid = repr(os.getuid())
+        except AttributeError:
+            uid = '1'
+        try:
+            pid = repr(os.getpid())
+        except AttributeError:
+            pid = '1'
+        _prefix = hostid + uid + pid
+    return "%s%d%d" % (_prefix, long(time.time()*100), _get_next_counter())
+
+# end of code from mimetools module
+# --------------------------------------------------------------------
+
+# This cut-n-pasted MimeWriter from standard library is here so can add
+# to HTTP headers rather than message body when appropriate. It also uses
+# \r\n in place of \n. This is nasty.
class MimeWriter:

    """Generic MIME writer that uses \\r\\n line endings.

    Methods:

    __init__()
    addheader()
    flushheaders()
    startbody()
    startmultipartbody()
    nextpart()
    lastpart()

    A MIME writer is much more primitive than a MIME parser.  It
    doesn't seek around on the output file, and it doesn't use large
    amounts of buffer space, so you have to write the parts in the
    order they should occur on the output file.  It does buffer the
    headers you add, allowing you to rearrange their order.

    General usage is:

    f = <open the output file>
    w = MimeWriter(f)
    ...call w.addheader(key, value) 0 or more times...

    followed by either:

    f = w.startbody(content_type)
    ...call f.write(data) for body data...

    or:

    w.startmultipartbody(subtype)
    for each part:
        subwriter = w.nextpart()
        ...use the subwriter's methods to create the subpart...
    w.lastpart()

    The subwriter is another MimeWriter instance, and should be
    treated in the same way as the toplevel MimeWriter.  This way,
    writing recursive body parts is easy.

    Warning: don't forget to call lastpart()!

    XXX There should be more state so calls made in the wrong order
    are detected.

    Some special cases:

    - startbody() just returns the file passed to the constructor;
      but don't use this knowledge, as it may be changed.

    - startmultipartbody() actually returns a file as well;
      this can be used to write the initial 'if you can read this your
      mailer is not MIME-aware' message.

    - If you call flushheaders(), the headers accumulated so far are
      written out (and forgotten); this is useful if you don't need a
      body part at all, e.g. for a subpart of type message/rfc822
      that's (mis)used to store some header-like information.

    - Passing a keyword argument 'prefix=<flag>' to addheader(),
      start*body() affects where the header is inserted; 0 means
      append at the end, 1 means insert at the start; default is
      append for addheader(), but insert for start*body(), which use
      it to determine where the Content-type header goes.

    - Passing a true 'add_to_http_hdrs' to addheader() or start*body()
      appends the header as a (key, value) pair to the http_hdrs list
      given to the constructor instead of writing it to the file.

    """

    def __init__(self, fp, http_hdrs=None):
        """
        fp: file-like object the message is written to
        http_hdrs: list that collects (key, value) pairs for headers added
         with add_to_http_hdrs true; required only if that flag is used
        """
        self._http_hdrs = http_hdrs
        self._fp = fp
        self._headers = []
        self._boundary = []
        self._first_part = True

    def addheader(self, key, value, prefix=0,
                  add_to_http_hdrs=0):
        """Buffer a header, or append it to the HTTP header list.

        prefix is ignored if add_to_http_hdrs is true.
        """
        lines = value.split("\r\n")
        # drop empty leading and trailing lines
        while lines and not lines[-1]:
            del lines[-1]
        while lines and not lines[0]:
            del lines[0]
        if add_to_http_hdrs:
            # HTTP headers are stored unfolded, on a single line
            self._http_hdrs.append((key, "".join(lines)))
            return
        # continuation lines are folded: stripped, then given a leading space
        for i in range(1, len(lines)):
            lines[i] = " " + lines[i].strip()
        line = key + ": " + "\r\n".join(lines) + "\r\n"
        if prefix:
            self._headers.insert(0, line)
        else:
            self._headers.append(line)

    def flushheaders(self):
        """Write the buffered headers to the file and forget them."""
        self._fp.writelines(self._headers)
        self._headers = []

    def startbody(self, ctype=None, plist=None, prefix=1,
                  add_to_http_hdrs=0, content_type=1):
        """Flush the headers and return the file to write the body to.

        ctype: content type; a Content-type header is added only if both
         ctype and content_type are true
        plist: sequence of (name, value) content type parameters
        prefix is ignored if add_to_http_hdrs is true.
        """
        if plist is None:
            plist = []
        if content_type and ctype:
            for name, value in plist:
                ctype = ctype + ';\r\n %s=%s' % (name, value)
            self.addheader("Content-type", ctype, prefix=prefix,
                           add_to_http_hdrs=add_to_http_hdrs)
        self.flushheaders()
        if not add_to_http_hdrs:
            # blank line separating the headers from the body
            self._fp.write("\r\n")
        self._first_part = True
        return self._fp

    def startmultipartbody(self, subtype, boundary=None, plist=None, prefix=1,
                           add_to_http_hdrs=0, content_type=1):
        """Start a multipart/<subtype> body and return the output file.

        boundary: boundary string; choose_boundary() supplies one if this
         is false
        The other arguments are as for startbody.
        """
        if plist is None:
            plist = []
        boundary = boundary or choose_boundary()
        self._boundary.append(boundary)
        return self.startbody("multipart/" + subtype,
                              [("boundary", boundary)] + list(plist),
                              prefix=prefix,
                              add_to_http_hdrs=add_to_http_hdrs,
                              content_type=content_type)

    def nextpart(self):
        """Write the delimiter for a new part and return its writer."""
        boundary = self._boundary[-1]
        if self._first_part:
            self._first_part = False
        else:
            # terminate the previous part's body
            self._fp.write("\r\n")
        self._fp.write("--" + boundary + "\r\n")
        return self.__class__(self._fp)

    def lastpart(self):
        """Write the closing delimiter of the current multipart body."""
        if self._first_part:
            # no part was written yet, so emit an (empty) one first
            self.nextpart()
        boundary = self._boundary.pop()
        self._fp.write("\r\n--" + boundary + "--\r\n")
+
+
class LocateError(ValueError):
    """Base class of the errors below that concern locating something."""

class AmbiguityError(LocateError):
    """A LocateError subclass."""

class ControlNotFoundError(LocateError):
    """A LocateError subclass."""

class ItemNotFoundError(LocateError):
    """A LocateError subclass."""


class ItemCountError(ValueError):
    """A ValueError subclass."""


class ParseError(Exception):
    """Raised by the form parsers, eg. for nested FORM elements."""
+
+
class _AbstractFormParser:
    """Collect form, control and label data from parser callbacks.

    This class only supplies the tag and data handlers; concrete parsers
    combine it with an HTML tokenizer class.  On completion:

    forms: list of ((name, action, method, enctype), attrs, controls)
     tuples, where attrs is the dict of FORM attributes and controls is a
     list of (type, name, attrs) tuples
    labels: list of attribute dicts of the LABEL elements that have a
     non-empty 'for' attribute; the label text is under the "__text" key
    base: value of the href attribute of the BASE element, or None

    """
    # thanks to Moshe Zadka for an example of sgmllib/htmllib usage
    def __init__(self, entitydefs=None):
        """
        entitydefs: mapping from entity reference text (eg. '&amp;') to
         replacement string; get_entitydefs() is used if this is None
        """
        if entitydefs is None:
            entitydefs = get_entitydefs()
        self._entitydefs = entitydefs

        self.base = None
        self.forms = []
        self.labels = []
        self._current_label = None
        self._current_form = None
        self._select = None
        self._optgroup = None
        self._option = None
        self._textarea = None

    def do_base(self, attrs):
        for key, value in attrs:
            if key == "href":
                self.base = value

    def end_body(self):
        # close whatever the document left open
        if self._current_label is not None:
            self.end_label()
        if self._current_form is not None:
            self.end_form()

    def start_form(self, attrs):
        if self._current_form is not None:
            raise ParseError("nested FORMs")
        name = None
        action = None
        enctype = "application/x-www-form-urlencoded"
        method = "GET"
        form_attrs = {}
        for key, value in attrs:
            if key == "name":
                name = value
            elif key == "action":
                action = value
            elif key == "method":
                method = value.upper()
            elif key == "enctype":
                enctype = value.lower()
            form_attrs[key] = value
        controls = []
        self._current_form = (name, action, method, enctype), form_attrs, controls

    def end_form(self):
        if self._current_label is not None:
            self.end_label()
        if self._current_form is None:
            raise ParseError("end of FORM before start")
        self.forms.append(self._current_form)
        self._current_form = None

    def start_select(self, attrs):
        if self._current_form is None:
            raise ParseError("start of SELECT before start of FORM")
        if self._select is not None:
            raise ParseError("nested SELECTs")
        if self._textarea is not None:
            raise ParseError("SELECT inside TEXTAREA")
        select_attrs = dict(attrs)

        self._select = select_attrs
        self._add_label(select_attrs)

        # a control entry carrying only the SELECT's own attributes is
        # recorded even before any OPTION is seen
        self._append_select_control({"__select": select_attrs})

    def end_select(self):
        if self._current_form is None:
            raise ParseError("end of SELECT before start of FORM")
        if self._select is None:
            raise ParseError("end of SELECT before start")

        if self._option is not None:
            self._end_option()

        self._select = None

    def start_optgroup(self, attrs):
        if self._select is None:
            raise ParseError("OPTGROUP outside of SELECT")
        self._optgroup = dict(attrs)

    def end_optgroup(self):
        if self._optgroup is None:
            raise ParseError("end of OPTGROUP before start")
        self._optgroup = None

    def _start_option(self, attrs):
        if self._select is None:
            raise ParseError("OPTION outside of SELECT")
        if self._option is not None:
            # OPTION end tags may be omitted: close the previous one
            self._end_option()

        self._option = dict(attrs)
        # an OPTION inside a disabled OPTGROUP is itself disabled
        if (self._optgroup and "disabled" in self._optgroup and
            "disabled" not in self._option):
            self._option["disabled"] = None

    def _end_option(self):
        if self._option is None:
            raise ParseError("end of OPTION before start")

        contents = self._option.get("contents", "").strip()
        self._option["contents"] = contents
        # value and label both default to the text between the OPTION tags
        if "value" not in self._option:
            self._option["value"] = contents
        if "label" not in self._option:
            self._option["label"] = contents
        # stuff dict of SELECT HTML attrs into a special private key
        # (gets deleted again later)
        self._option["__select"] = self._select
        self._append_select_control(self._option)
        self._option = None

    def _append_select_control(self, attrs):
        controls = self._current_form[2]
        name = self._select.get("name")
        controls.append(("select", name, attrs))

    def start_textarea(self, attrs):
        if self._current_form is None:
            raise ParseError("start of TEXTAREA before start of FORM")
        if self._textarea is not None:
            raise ParseError("nested TEXTAREAs")
        if self._select is not None:
            raise ParseError("TEXTAREA inside SELECT")
        textarea_attrs = dict(attrs)
        self._add_label(textarea_attrs)

        self._textarea = textarea_attrs

    def end_textarea(self):
        if self._current_form is None:
            raise ParseError("end of TEXTAREA before start of FORM")
        if self._textarea is None:
            raise ParseError("end of TEXTAREA before start")
        controls = self._current_form[2]
        name = self._textarea.get("name")
        controls.append(("textarea", name, self._textarea))
        self._textarea = None

    def start_label(self, attrs):
        if self._current_label:
            self.end_label()
        label_attrs = dict(attrs)
        taken = bool(label_attrs.get("for"))  # empty id is invalid
        label_attrs["__text"] = ""
        label_attrs["__taken"] = taken
        if taken:
            self.labels.append(label_attrs)
        self._current_label = label_attrs

    def end_label(self):
        label = self._current_label
        if label is None:
            # something is ugly in the HTML, but we're ignoring it
            return
        self._current_label = None
        # if it is staying around, it is True in all cases
        del label["__taken"]

    def _add_label(self, d):
        """Attach the open LABEL, if it is still free, to control attrs d."""
        if self._current_label is not None:
            if self._current_label["__taken"]:
                self.end_label()  # be fuzzy
            else:
                self._current_label["__taken"] = True
                d["__label"] = self._current_label

    def handle_data(self, data):
        if self._option is not None:
            # self._option is a dictionary of the OPTION element's HTML
            # attributes, but it has two special keys, one of which is the
            # special "contents" key contains text between OPTION tags (the
            # other is the "__select" key: see the _end_option method)
            target = self._option
            key = "contents"
        elif self._textarea is not None:
            target = self._textarea
            key = "value"
        # not if within option or textarea
        elif self._current_label is not None:
            target = self._current_label
            key = "__text"
        else:
            return

        if key in target:
            target[key] = target[key] + data
        else:
            target[key] = data

    def do_button(self, attrs):
        if self._current_form is None:
            raise ParseError("start of BUTTON before start of FORM")
        button_attrs = {"type": "submit"}  # default
        button_attrs.update(dict(attrs))
        controls = self._current_form[2]

        name = button_attrs.get("name")
        # we don't want to lose information, so use a type string that
        # doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON}
        # e.g. type for BUTTON/RESET is "resetbutton"
        # (type for INPUT/RESET is "reset")
        control_type = button_attrs["type"] + "button"
        self._add_label(button_attrs)
        controls.append((control_type, name, button_attrs))

    def do_input(self, attrs):
        if self._current_form is None:
            raise ParseError("start of INPUT before start of FORM")
        input_attrs = {"type": "text"}  # default
        input_attrs.update(dict(attrs))
        controls = self._current_form[2]

        control_type = input_attrs["type"]
        name = input_attrs.get("name")
        self._add_label(input_attrs)
        controls.append((control_type, name, input_attrs))

    def do_isindex(self, attrs):
        if self._current_form is None:
            raise ParseError("start of ISINDEX before start of FORM")
        isindex_attrs = dict(attrs)
        controls = self._current_form[2]

        self._add_label(isindex_attrs)
        # isindex doesn't have type or name HTML attributes
        controls.append(("isindex", None, isindex_attrs))

    def handle_entityref(self, name):
        table = self._entitydefs
        fullname = "&%s;" % name
        if fullname in table:
            self.handle_data(table[fullname])
        else:
            self.unknown_entityref(name)

    def unescape_attr(self, name):
        return unescape(name, self._entitydefs)

    def unescape_attrs(self, attrs):
        """Return a copy of dict attrs with every value unescaped."""
        escaped_attrs = {}
        for key, val in attrs.items():
            try:
                val.items
            except AttributeError:
                escaped_attrs[key] = self.unescape_attr(val)
            else:
                # e.g. "__select" -- yuck!
                escaped_attrs[key] = self.unescape_attrs(val)
        return escaped_attrs

    # unknown references are passed through as literal text
    def unknown_entityref(self, ref): self.handle_data("&%s;" % ref)
    def unknown_charref(self, ref): self.handle_data("&#%s;" % ref)
+
+
+# HTMLParser.HTMLParser is recent, so live without it if it's not available
+# (also, htmllib.HTMLParser is much more tolerant of bad HTML)
try:
    import HTMLParser
except ImportError:
    class XHTMLCompatibleFormParser:
        """Placeholder used when the HTMLParser module is unavailable."""
        def __init__(self, entitydefs=None):
            raise ValueError("HTMLParser could not be imported")
else:
    class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
        """Good for XHTML, bad for tolerance of incorrect HTML."""
        # thanks to Michael Howitz for this!
        def __init__(self, entitydefs=None):
            HTMLParser.HTMLParser.__init__(self)
            _AbstractFormParser.__init__(self, entitydefs)

        def start_option(self, attrs):
            _AbstractFormParser._start_option(self, attrs)

        def end_option(self):
            _AbstractFormParser._end_option(self)

        def handle_starttag(self, tag, attrs):
            # dispatch to start_<tag>, falling back to do_<tag>; tags with
            # neither handler are ignored
            method = getattr(self, "start_" + tag, None)
            if method is None:
                method = getattr(self, "do_" + tag, None)
            if method is not None:
                method(attrs)

        def handle_endtag(self, tag):
            method = getattr(self, "end_" + tag, None)
            if method is not None:
                method()

        # taken from sgmllib, with changes
        def handle_charref(self, name):
            """Handle a character reference, decimal or hexadecimal.

            name is the reference without the leading '&#' and trailing ';';
            hexadecimal references arrive with a leading 'x' or 'X'.
            References that are malformed or outside 0-255 are passed to
            unknown_charref.
            """
            try:
                if name[:1] in ("x", "X"):
                    n = int(name[1:], 16)
                else:
                    n = int(name)
            except ValueError:
                self.unknown_charref(name)
                return
            if not 0 <= n <= 255:
                self.unknown_charref(name)
                return
            self.handle_data(chr(n))

        def unescape(self, name):
            # Use the entitydefs passed into constructor, not
            # HTMLParser.HTMLParser's entitydefs.
            return self.unescape_attr(name)

        def unescape_attr_if_required(self, name):
            return name  # HTMLParser.HTMLParser already did it
        def unescape_attrs_if_required(self, attrs):
            return attrs  # ditto
+
+import htmllib, formatter
class FormParser(_AbstractFormParser, htmllib.HTMLParser):
    """Good for tolerance of incorrect HTML, bad for XHTML."""
    def __init__(self, entitydefs=None):
        # the tokenizer gets a formatter that discards all output
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
        _AbstractFormParser.__init__(self, entitydefs)

    # attribute values are unescaped here, using this parser's entitydefs
    def unescape_attrs_if_required(self, attrs):
        return self.unescape_attrs(attrs)
    def unescape_attr_if_required(self, name):
        return self.unescape_attr(name)

    def do_option(self, attrs):
        # OPTION is handled as a tag without an end tag
        _AbstractFormParser._start_option(self, attrs)
+
+#FormParser = XHTMLCompatibleFormParser # testing hack
+
def get_entitydefs():
    """Return a new dict mapping '&name;' to the text of each entity.

    The entities are those of htmlentitydefs.entitydefs.

    """
    return dict([("&%s;" % name, char)
                 for name, char in htmlentitydefs.entitydefs.items()])
+
def ParseResponse(response, select_default=False,
                  ignore_errors=False,  # ignored!
                  form_parser_class=FormParser,
                  request_class=urllib2.Request,
                  entitydefs=None, backwards_compat=True):
    """Parse HTTP response and return a list of HTMLForm instances.

    This is ParseFile applied to the response, with the result of
    response.geturl() as the base URI.  The return value of urllib2.urlopen
    can be conveniently passed to this function as the response parameter.

    ClientForm.ParseError is raised on parse errors.

    response: file-like object (supporting read() method) with a method
     geturl(), returning the URI of the HTTP response
    select_default: for multiple-selection SELECT controls and RADIO controls,
     pick the first item as the default if none are selected in the HTML
    ignore_errors: ignored
    form_parser_class: class to instantiate and use to parse the document
    request_class: class to return from .click() method (default is
     urllib2.Request)
    entitydefs: mapping like {'&amp;': '&', ...} containing HTML entity
     definitions (a sensible default is used)

    backwards_compat: boolean that determines whether the returned HTMLForm
     objects are backwards-compatible with old code.  If backwards_compat is
     True:

     - ClientForm 0.1 code will continue to work as before.

     - Label searches that do not specify a nr (number or count) will always
       get the first match, even if other controls match.  If
       backwards_compat is False, label searches that have ambiguous results
       will raise an AmbiguityError.

     - Item label matching is done by strict string comparison rather than
       substring matching.

     - De-selecting individual list items is allowed even if the Item is
       disabled.

    The backwards_compat argument will be deprecated in a future release.

    Pass a true value for select_default if you want the behaviour specified by
    RFC 1866 (the HTML 2.0 standard), which is to select the first item in a
    RADIO or multiple-selection SELECT control if none were selected in the
    HTML.  Most browsers (including Microsoft Internet Explorer (IE) and
    Netscape Navigator) instead leave all items unselected in these cases.  The
    W3C HTML 4.0 standard leaves this behaviour undefined in the case of
    multiple-selection SELECT controls, but insists that at least one RADIO
    button should be checked at all times, in contradiction to browser
    behaviour.

    There is a choice of parsers.  ClientForm.XHTMLCompatibleFormParser (uses
    HTMLParser.HTMLParser) works best for XHTML, ClientForm.FormParser (uses
    htmllib.HTMLParser) (the default) works best for ordinary grubby HTML.
    Note that HTMLParser is only available in Python 2.2 and later.  You can
    pass your own class in here as a hack to work around bad HTML, but at your
    own risk: there is no well-defined interface.

    """
    document_uri = response.geturl()
    # ignore_errors is not forwarded: False is always passed in its place
    return ParseFile(response, document_uri,
                     select_default=select_default,
                     ignore_errors=False,
                     form_parser_class=form_parser_class,
                     request_class=request_class,
                     entitydefs=entitydefs,
                     backwards_compat=backwards_compat)
+
def ParseFile(file, base_uri, select_default=False,
              ignore_errors=False,  # ignored!
              form_parser_class=FormParser,
              request_class=urllib2.Request,
              entitydefs=None, backwards_compat=True):
    """Parse HTML and return a list of HTMLForm instances.

    ClientForm.ParseError is raised on parse errors; the exception is given
    a base_uri attribute holding the base_uri argument.

    file: file-like object (supporting read() method) containing HTML with zero
     or more forms to be parsed; it is read in chunks until read() returns
     an empty string
    base_uri: the URI of the document (note that the base URI used to submit
     the form will be that given in the BASE element if present, not that of
     the document)

    For the other arguments and further details, see ParseResponse.__doc__.

    """
    import sys
    if backwards_compat:
        deprecation("operating in backwards-compatibility mode")
    fp = form_parser_class(entitydefs)
    while 1:
        data = file.read(CHUNK)
        if not data:
            # only an empty read means end of input: a short, non-empty
            # chunk may be followed by more data
            break
        try:
            fp.feed(data)
        except ParseError:
            # sys.exc_info() instead of "except ..., e" / "except ... as e"
            # so that the same source works on every Python version
            sys.exc_info()[1].base_uri = base_uri
            raise
    if fp.base is not None:
        # HTML BASE element takes precedence over document URI
        base_uri = fp.base
    labels = []
    id_to_labels = {}
    for label_attrs in fp.labels:
        label = Label(label_attrs)
        labels.append(label)
        # group the labels by the id of the control they refer to
        id_to_labels.setdefault(label_attrs["for"], []).append(label)
    forms = []
    for (name, action, method, enctype), attrs, controls in fp.forms:
        if action is None:
            action = base_uri
        else:
            action = urljoin(base_uri, action)
        action = fp.unescape_attr_if_required(action)
        name = fp.unescape_attr_if_required(name)
        attrs = fp.unescape_attrs_if_required(attrs)
        # would be nice to make HTMLForm class (form builder) pluggable
        form = HTMLForm(
            action, method, enctype, name, attrs, request_class,
            forms, labels, id_to_labels, backwards_compat)
        for index, control in enumerate(controls):
            control_type, control_name, control_attrs = control
            control_attrs = fp.unescape_attrs_if_required(control_attrs)
            control_name = fp.unescape_attr_if_required(control_name)
            form.new_control(control_type, control_name, control_attrs,
                             select_default=select_default,
                             index=index)
        forms.append(form)
    # fixup runs only after every form has been built
    for form in forms:
        form.fixup()
    return forms
+
+
class Label:
    """Text and attributes of one LABEL element.

    Public attributes:

    id: value of the label's 'for' attribute
    text: label text (readonly); the stripped text when in
     backwards-compatibility mode, otherwise the result of compress_text
     applied to the stripped text
    attrs: the dictionary the Label was constructed from

    """
    def __init__(self, attrs):
        stripped = attrs.get("__text").strip()
        self.id = attrs.get("for")
        self._text = stripped
        self._ctext = compress_text(stripped)
        self.attrs = attrs
        self._backwards_compat = False  # maintained by HTMLForm

    def __getattr__(self, name):
        # only reached for names that normal attribute lookup did not find
        if name != "text":
            return getattr(Label, name)
        if self._backwards_compat:
            return self._text
        return self._ctext

    def __setattr__(self, name, value):
        if name == "text":
            # don't see any need for this
            raise AttributeError("text attribute is read-only")
        self.__dict__[name] = value

    def __str__(self):
        return '<Label(id=%r, text=%r)>' % (self.id, self.text)
+
+
def _getLabel(attrs):
    """Return a Label built from attrs["__label"], or None if there is none."""
    label_attrs = attrs.get("__label")
    if label_attrs is None:
        return None
    return Label(label_attrs)
+
class Control:
    """An HTML form control.

    An HTMLForm contains a sequence of Controls.  The Controls in an HTMLForm
    are accessed using the HTMLForm.find_control method or the
    HTMLForm.controls attribute.

    Control instances are usually constructed using the ParseFile /
    ParseResponse functions.  If you use those functions, you can ignore the
    rest of this paragraph.  A Control is only properly initialised after the
    fixup method has been called.  In fact, this is only strictly necessary for
    ListControl instances.  This is necessary because ListControls are built up
    from ListControls each containing only a single item, and their initial
    value(s) can only be known after the sequence is complete.

    The types and values that are acceptable for assignment to the value
    attribute are defined by subclasses.

    If the disabled attribute is true, this represents the state typically
    represented by browsers by 'greying out' a control.  If the disabled
    attribute is true, the Control will raise AttributeError if an attempt is
    made to change its value.  In addition, the control will not be considered
    'successful' as defined by the W3C HTML 4 standard -- ie. it will
    contribute no data to the return value of the HTMLForm.click* methods.  To
    enable a control, set the disabled attribute to a false value.

    If the readonly attribute is true, the Control will raise AttributeError if
    an attempt is made to change its value.  To make a control writable, set
    the readonly attribute to a false value.

    All controls have the disabled and readonly attributes, not only those that
    may have the HTML attributes of the same names.

    On assignment to the value attribute, the following exceptions are raised:
    TypeError, AttributeError (if the value attribute should not be assigned
    to, because the control is disabled, for example) and ValueError.

    If the name or value attributes are None, or the value is an empty list, or
    if the control is disabled, the control is not successful.

    Public attributes:

    type: string describing type of control (see the keys of the
     HTMLForm.type2class dictionary for the allowable values) (readonly)
    name: name of control (readonly)
    value: current value of control (subclasses may allow a single value, a
     sequence of values, or either)
    disabled: disabled state
    readonly: readonly state
    id: value of id HTML attribute

    """
    def __init__(self, type, name, attrs, index=None):
        """
        type: string describing type of control (see the keys of the
         HTMLForm.type2class dictionary for the allowable values)
        name: control name
        attrs: HTML attributes of control's HTML element

        """
        raise NotImplementedError()

    def add_to_form(self, form):
        """Remember form as this control's form and append self to it."""
        self._form = form
        form.controls.append(self)

    def fixup(self):
        """Do nothing; subclasses may override."""
        pass

    def is_of_kind(self, kind):
        raise NotImplementedError()

    def clear(self):
        raise NotImplementedError()

    # attribute access is defined entirely by subclasses
    def __getattr__(self, name): raise NotImplementedError()
    def __setattr__(self, name, value): raise NotImplementedError()

    def pairs(self):
        """Return list of (key, value) pairs suitable for passing to urlencode.
        """
        result = []
        for index, key, value in self._totally_ordered_pairs():
            # the index is only needed for ordering across controls
            result.append((key, value))
        return result

    def _totally_ordered_pairs(self):
        """Return list of (index, key, value) tuples.

        Like pairs, but allows preserving correct ordering even where several
        controls are involved.

        """
        raise NotImplementedError()

    def _write_mime_data(self, mw):
        """Write data for this control to a MimeWriter."""
        # called by HTMLForm
        for name, value in self.pairs():
            # one form-data part per (name, value) pair
            part = mw.nextpart()
            part.addheader("Content-disposition",
                           'form-data; name="%s"' % name, 1)
            body = part.startbody(prefix=0)
            body.write(value)

    def __str__(self):
        raise NotImplementedError()

    def get_labels(self):
        """Return all labels (Label instances) for this control.

        If the control was surrounded by a <label> tag, that will be the first
        label; all other labels, connected by 'for' and 'id', are in the order
        that appear in the HTML.

        """
        found = []
        if self._label:
            found.append(self._label)
        if self.id:
            by_id = self._form._id_to_labels.get(self.id, ())
            found.extend(by_id)
        return found
+
+
+#---------------------------------------------------
class ScalarControl(Control):
    """Control whose value is not restricted to one of a prescribed set.

    Some ScalarControls don't accept any value attribute.  Otherwise, takes a
    single value, which must be string-like.

    Additional read-only public attribute:

    attrs: dictionary mapping the names of original HTML attributes of the
     control to their values

    """
    def __init__(self, type, name, attrs, index=None):
        self._index = index
        self._label = _getLabel(attrs)
        # type and name are stored directly, since assigning to them raises
        self.__dict__["type"] = type.lower()
        self.__dict__["name"] = name
        self._value = attrs.get("value")
        self.disabled = "disabled" in attrs
        self.readonly = "readonly" in attrs
        self.id = attrs.get("id")

        self.attrs = attrs.copy()

        self._clicked = False

    def __getattr__(self, name):
        if name != "value":
            raise AttributeError("%s instance has no attribute '%s'" %
                                 (self.__class__.__name__, name))
        return self.__dict__["_value"]

    def __setattr__(self, name, value):
        if name in ("name", "type"):
            raise AttributeError("%s attribute is readonly" % name)
        if name != "value":
            self.__dict__[name] = value
            return
        # the type check comes before the readonly and disabled checks
        if not isstringlike(value):
            raise TypeError("must assign a string")
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        if self.disabled:
            raise AttributeError("control '%s' is disabled" % self.name)
        self.__dict__["_value"] = value

    def _totally_ordered_pairs(self):
        name = self.name
        value = self.value
        successful = not (name is None or value is None or self.disabled)
        if not successful:
            return []
        return [(self._index, name, value)]

    def clear(self):
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        self.__dict__["_value"] = None

    def __str__(self):
        name = self.name
        if name is None:
            name = "<None>"
        value = self.value
        if value is None:
            value = "<None>"

        flags = []
        if self.disabled:
            flags.append("disabled")
        if self.readonly:
            flags.append("readonly")
        info = ", ".join(flags)
        if info:
            info = " (%s)" % info

        return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info)
+
+
+#---------------------------------------------------
class TextControl(ScalarControl):
    """Textual input control.

    Covers:

    INPUT/TEXT
    INPUT/PASSWORD
    INPUT/FILE
    INPUT/HIDDEN
    TEXTAREA

    """
    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        if self._value is None:
            # a missing value attribute means an empty string
            self._value = ""
        if self.type == "hidden":
            # hidden controls are always readonly
            self.readonly = True

    def is_of_kind(self, kind):
        return kind == "text"
+
+#---------------------------------------------------
class FileControl(ScalarControl):
    """File upload with INPUT TYPE=FILE.

    The value attribute of a FileControl is always None.  Use add_file instead.

    Additional public method: add_file

    """

    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        self._value = None
        # list of (file_object, content_type, filename) tuples
        self._upload_data = []

    def is_of_kind(self, kind):
        return kind == "file"

    def clear(self):
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        self._upload_data = []

    def __setattr__(self, name, value):
        if name not in ("value", "name", "type"):
            self.__dict__[name] = value
        else:
            raise AttributeError("%s attribute is readonly" % name)

    def add_file(self, file_object, content_type=None, filename=None):
        """Add a file to be uploaded.

        file_object: object with a read method
        content_type: None or string-like; None means
         "application/octet-stream"
        filename: None or string-like

        TypeError is raised if an argument is of the wrong kind.

        """
        if not hasattr(file_object, "read"):
            raise TypeError("file-like object must have read method")
        if content_type is not None and not isstringlike(content_type):
            raise TypeError("content type must be None or string-like")
        if filename is not None and not isstringlike(filename):
            raise TypeError("filename must be None or string-like")
        if content_type is None:
            content_type = "application/octet-stream"
        upload = (file_object, content_type, filename)
        self._upload_data.append(upload)

    def _totally_ordered_pairs(self):
        # XXX should it be successful even if unnamed?
        if self.name is None or self.disabled:
            return []
        return [(self._index, self.name, "")]

    def _write_mime_data(self, mw):
        # called by HTMLForm
        upload_count = len(self._upload_data)
        if upload_count == 1:
            # single file
            file_object, content_type, filename = self._upload_data[0]
            part = mw.nextpart()
            if filename:
                fn_part = '; filename="%s"' % filename
            else:
                fn_part = ""
            disp = 'form-data; name="%s"%s' % (self.name, fn_part)
            part.addheader("Content-disposition", disp, prefix=1)
            body = part.startbody(content_type, prefix=0)
            body.write(file_object.read())
        elif upload_count != 0:
            # multiple files: one multipart/mixed part with a subpart each
            part = mw.nextpart()
            disp = 'form-data; name="%s"' % self.name
            part.addheader("Content-disposition", disp, prefix=1)
            part.startmultipartbody("mixed", prefix=0)
            for file_object, content_type, filename in self._upload_data:
                subpart = part.nextpart()
                if filename:
                    fn_part = '; filename="%s"' % filename
                else:
                    fn_part = ""
                disp = "file%s" % fn_part
                subpart.addheader("Content-disposition", disp, prefix=1)
                body = subpart.startbody(content_type, prefix=0)
                body.write(file_object.read())
            part.lastpart()

    def __str__(self):
        name = self.name
        if name is None:
            name = "<None>"

        if self._upload_data:
            shown = []
            for upload in self._upload_data:
                filename = upload[2]
                if filename is None:
                    shown.append("<Unnamed file>")
                else:
                    shown.append(filename)
            value = ", ".join(shown)
        else:
            value = "<No files added>"

        flags = []
        if self.disabled:
            flags.append("disabled")
        if self.readonly:
            flags.append("readonly")
        info = ", ".join(flags)
        if info:
            info = " (%s)" % info

        return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info)
+
+
+#---------------------------------------------------
class IsindexControl(ScalarControl):
    """ISINDEX control.

    ISINDEX is the odd-one-out of HTML form controls.  In fact, it isn't really
    part of regular HTML forms at all, and predates it.  You're only allowed
    one ISINDEX per HTML document.  ISINDEX and regular form submission are
    mutually exclusive -- either submit a form, or the ISINDEX.

    Having said this, since ISINDEX controls may appear in forms (which is
    probably bad HTML), ParseFile / ParseResponse will include them in the
    HTMLForm instances it returns.  You can set the ISINDEX's value, as with
    any other control (but note that ISINDEX controls have no name, so you'll
    need to use the type argument of set_value!).  When you submit the form,
    the ISINDEX will not be successful (ie., no data will get returned to the
    server as a result of its presence), unless you click on the ISINDEX
    control, in which case the ISINDEX gets submitted instead of the form:

    form.set_value("my isindex value", type="isindex")
    urllib2.urlopen(form.click(type="isindex"))

    ISINDEX elements outside of FORMs are ignored.  If you want to submit one
    by hand, do it like so:

    url = urlparse.urljoin(page_uri, "?"+urllib.quote_plus("my isindex value"))
    result = urllib2.urlopen(url)

    """
    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        if self._value is None:
            self._value = ""

    def is_of_kind(self, kind):
        return kind in ["text", "clickable"]

    def _totally_ordered_pairs(self):
        # never contributes (name, value) pairs to a form submission
        return []

    def _click(self, form, coord, return_type, request_class=urllib2.Request):
        # Relative URL for ISINDEX submission: instead of "foo=bar+baz",
        # want "bar+baz".
        # This doesn't seem to be specified in HTML 4.01 spec. (ISINDEX is
        # deprecated in 4.01, but it should still say how to submit it).
        # Submission of ISINDEX is explained in the HTML 3.2 spec, though.
        parts = urlparse.urlparse(form.action)
        # replace the query with the quoted value and drop the fragment
        parts = parts[:-2] + (urllib.quote_plus(self.value), "")
        url = urlparse.urlunparse(parts)

        if return_type == "pairs":
            return []
        if return_type == "request_data":
            return url, None, []
        return request_class(url)

    def __str__(self):
        value = self.value
        if value is None:
            value = "<None>"

        flags = []
        if self.disabled:
            flags.append("disabled")
        if self.readonly:
            flags.append("readonly")
        info = ", ".join(flags)
        if info:
            info = " (%s)" % info

        return "<%s(%s)%s>" % (self.__class__.__name__, value, info)
+
+
+#---------------------------------------------------
+class IgnoreControl(ScalarControl):
+    """Control that we're not interested in.
+
+    Covers:
+
+    INPUT/RESET
+    BUTTON/RESET
+    INPUT/BUTTON
+    BUTTON/BUTTON
+
+    These controls are always unsuccessful, in the terminology of HTML 4 (ie.
+    they never require any information to be returned to the server).
+
+    BUTTON/BUTTON is used to generate events for script embedded in HTML.
+
+    The value attribute of IgnoreControl is always None.
+
+    """
+    def __init__(self, type, name, attrs, index=None):
+        ScalarControl.__init__(self, type, name, attrs, index)
+        # Whatever the HTML said, an ignored control carries no value.
+        self._value = None
+
+    def is_of_kind(self, kind):
+        # Ignored controls belong to no kind at all.
+        return False
+
+    def __setattr__(self, name, value):
+        # value, name and type may not be assigned; everything else is
+        # stored directly on the instance.
+        if name == "value":
+            raise AttributeError(
+                "control '%s' is ignored, hence read-only" % self.name)
+        if name in ("name", "type"):
+            raise AttributeError("%s attribute is readonly" % name)
+        self.__dict__[name] = value
+
+
+#---------------------------------------------------
+# ListControls
+
+# helpers and subsidiary classes
+
+class Item:
+    """A single item (OPTION, or RADIO/CHECKBOX INPUT) of a ListControl.
+
+    Only the selected and disabled attributes may be assigned to; all other
+    attributes are read-only.
+
+    """
+    def __init__(self, control, attrs, index=None):
+        label = _getLabel(attrs)
+        if label:
+            labels = [label]
+        else:
+            labels = []
+        # __setattr__ rejects most names, so fill the instance dictionary
+        # directly.
+        fields = {}
+        fields["name"] = attrs["value"]
+        fields["_labels"] = labels
+        fields["attrs"] = attrs
+        fields["_control"] = control
+        fields["disabled"] = attrs.has_key("disabled")
+        fields["_selected"] = False
+        fields["id"] = attrs.get("id")
+        fields["_index"] = index
+        self.__dict__.update(fields)
+        control.items.append(self)
+
+    def get_labels(self):
+        """Return all labels (Label instances) for this item.
+
+        For items that represent radio buttons or checkboxes, if the item was
+        surrounded by a <label> tag, that will be the first label; all other
+        labels, connected by 'for' and 'id', are in the order that appear in
+        the HTML.
+
+        For items that represent select options, if the option had a label
+        attribute, that will be the first label.  If the option has contents
+        (text within the option tags) and it is not the same as the label
+        attribute (if any), that will be a label.  There is nothing in the
+        spec to my knowledge that makes an option with an id unable to be the
+        target of a label's for attribute, so those are included, if any, for
+        the sake of consistency and completeness.
+
+        """
+        labels = list(self._labels)
+        if self.id:
+            by_id = self._control._form._id_to_labels
+            labels.extend(by_id.get(self.id, ()))
+        return labels
+
+    def __getattr__(self, name):
+        # "selected" is a virtual attribute backed by _selected.
+        if name != "selected":
+            raise AttributeError(name)
+        return self._selected
+
+    def __setattr__(self, name, value):
+        if name == "selected":
+            # The owning control enforces single/multiple selection rules.
+            self._control._set_selected_state(self, value)
+        elif name == "disabled":
+            self.__dict__["disabled"] = bool(value)
+        else:
+            raise AttributeError(name)
+
+    def __str__(self):
+        text = self.name
+        if self.selected:
+            text = "*" + text
+        if self.disabled:
+            text = "(%s)" % text
+        return text
+
+    def __repr__(self):
+        pairs = [("name", self.name), ("id", self.id)] + self.attrs.items()
+        rendered = " ".join(["%s=%r" % (key, val) for key, val in pairs])
+        return "<%s %s>" % (self.__class__.__name__, rendered)
+
+def disambiguate(items, nr, **kwds):
+    """Pick one element of items, using nr to resolve ambiguity.
+
+    items: sequence of candidates
+    nr: 0-based index into items, or None to demand a unique candidate
+    kwds: the query that produced items; used only in error messages
+
+    Raises ItemNotFoundError if items is empty or has no element at index nr,
+    and AmbiguityError if nr is None and there is more than one candidate.
+
+    """
+    msg = " ".join(["%s=%r" % (key, value) for key, value in kwds.items()])
+    if not items:
+        raise ItemNotFoundError(msg)
+    if nr is None:
+        if len(items) > 1:
+            raise AmbiguityError(msg)
+        nr = 0
+    if nr >= len(items):
+        raise ItemNotFoundError(msg)
+    return items[nr]
+
+class ListControl(Control):
+ """Control representing a sequence of items.
+
+ The value attribute of a ListControl represents the successful list items
+ in the control. The successful list items are those that are selected and
+ not disabled.
+
+ ListControl implements both list controls that take a length-1 value
+ (single-selection) and those that take length >1 values
+ (multiple-selection).
+
+ ListControls accept sequence values only. Some controls only accept
+ sequences of length 0 or 1 (RADIO, and single-selection SELECT).
+ In those cases, ItemCountError is raised if len(sequence) > 1. CHECKBOXes
+ and multiple-selection SELECTs (those having the "multiple" HTML attribute)
+ accept sequences of any length.
+
+ Note the following mistake:
+
+ control.value = some_value
+ assert control.value == some_value # not necessarily true
+
+ The reason for this is that the value attribute always gives the list items
+ in the order they were listed in the HTML.
+
+ ListControl items can also be referred to by their labels instead of names.
+ Use the label argument to .get(), and the .set_value_by_label(),
+ .get_value_by_label() methods.
+
+ Note that, rather confusingly, though SELECT controls are represented in
+ HTML by SELECT elements (which contain OPTION elements, representing
+ individual list items), CHECKBOXes and RADIOs are not represented by *any*
+ element. Instead, those controls are represented by a collection of INPUT
+ elements. For example, this is a SELECT control, named "control1":
+
+ <select name="control1">
+ <option>foo</option>
+ <option value="1">bar</option>
+ </select>
+
+ and this is a CHECKBOX control, named "control2":
+
+ <input type="checkbox" name="control2" value="foo" id="cbe1">
+ <input type="checkbox" name="control2" value="bar" id="cbe2">
+
+ The id attribute of a CHECKBOX or RADIO ListControl is always that of its
+ first element (for example, "cbe1" above).
+
+
+ Additional read-only public attribute: multiple.
+
+ """
+
+ # ListControls are built up by the parser from their component items by
+ # creating one ListControl per item, consolidating them into a single
+ # master ListControl held by the HTMLForm:
+
+ # -User calls form.new_control(...)
+ # -Form creates Control, and calls control.add_to_form(self).
+ # -Control looks for a Control with the same name and type in the form,
+ # and if it finds one, merges itself with that control by calling
+ # control.merge_control(self). The first Control added to the form, of
+ # a particular name and type, is the only one that survives in the
+ # form.
+ # -Form calls control.fixup for all its controls. ListControls in the
+ # form know they can now safely pick their default values.
+
+ # To create a ListControl without an HTMLForm, use:
+
+ # control.merge_control(new_control)
+
+ # (actually, it's much easier just to use ParseFile)
+
+ _label = None
+
+    def __init__(self, type, name, attrs={}, select_default=False,
+                 called_as_base_class=False, index=None):
+        """
+        ListControl cannot be instantiated directly: subclasses pass
+        called_as_base_class=True, otherwise NotImplementedError is raised.
+
+        select_default: for RADIO and multiple-selection SELECT controls, pick
+         the first item as the default if no 'selected' HTML attribute is
+         present
+
+        """
+        if not called_as_base_class:
+            raise NotImplementedError()
+
+        # type and name are read-only via __setattr__, so they are written
+        # straight into the instance dictionary.
+        state = self.__dict__
+        state["type"] = type.lower()
+        state["name"] = name
+
+        self._value = attrs.get("value")
+        self.disabled = False
+        self.readonly = False
+        self.id = attrs.get("id")
+
+        # Each Control merged in with .merge_control() corresponds to a single
+        # list item, and its items are collected here: see
+        # ListControl.__doc__.
+        self.items = []
+        self._form = None
+
+        self._clicked = False
+        self._select_default = select_default
+
+    def clear(self):
+        """Deselect the items by assigning an empty list as the value."""
+        self.value = []
+
+    def is_of_kind(self, kind):
+        """Return whether this control is of the given kind.
+
+        Recognised kinds are "list" (always true), "multilist" (true for
+        multiple-selection controls) and "singlelist" (true otherwise).
+
+        """
+        if kind == "list":
+            return True
+        if kind == "multilist":
+            return bool(self.multiple)
+        if kind == "singlelist":
+            return not self.multiple
+        return False
+
+    def get_items(self, name=None, label=None, id=None,
+                  exclude_disabled=False):
+        """Return matching items by name or label.
+
+        For argument docs, see the docstring for .get()
+
+        """
+        for what, arg in [("name", name), ("label", label), ("id", id)]:
+            if arg is not None and not isstringlike(arg):
+                raise TypeError("item %s must be string-like" % what)
+        compat = self._form.backwards_compat
+        matches = []  # order is important
+        for item in self.items:
+            if exclude_disabled and item.disabled:
+                continue
+            if name is not None and item.name != name:
+                continue
+            if label is not None:
+                # backwards_compat demands an exact label match; otherwise a
+                # substring match is enough.
+                label_matched = False
+                for item_label in item.get_labels():
+                    if compat:
+                        label_matched = item_label.text == label
+                    else:
+                        label_matched = item_label.text.find(label) > -1
+                    if label_matched:
+                        break
+                if not label_matched:
+                    continue
+            if id is not None and item.id != id:
+                continue
+            matches.append(item)
+        return matches
+
+    def get(self, name=None, label=None, id=None, nr=None,
+            exclude_disabled=False):
+        """Return item by name or label, disambiguating if necessary with nr.
+
+        All arguments must be passed by name, with the exception of 'name',
+        which may be used as a positional argument.
+
+        If name is specified, then the item must have the indicated name.
+
+        If label is specified, then the item must have a label whose
+        whitespace-compressed, stripped, text substring-matches the indicated
+        label string (eg. label="please choose" will match
+        "  Do  please  choose an item ").
+
+        If id is specified, then the item must have the indicated id.
+
+        nr is an optional 0-based index of the items matching the query.
+
+        If nr is the default None value and more than one item is found,
+        raises AmbiguityError (unless the HTMLForm instance's backwards_compat
+        attribute is true).
+
+        If no item is found, or if items are found but nr is specified and not
+        found, raises ItemNotFoundError.
+
+        Optionally excludes disabled items.
+
+        """
+        if nr is None and self._form.backwards_compat:
+            # old behaviour: silently take the first match  :-/
+            nr = 0
+        candidates = self.get_items(name, label, id, exclude_disabled)
+        return disambiguate(candidates, nr, name=name, label=label, id=id)
+
+    def _get(self, name, by_label=False, nr=None, exclude_disabled=False):
+        """Return a single item by name, or by label if by_label is true.
+
+        Strictly for use by deprecated methods.  nr and exclude_disabled have
+        the same meaning as for .get().
+
+        """
+        if by_label:
+            name, label = None, name
+        else:
+            label = None
+        # The third positional parameter of .get() is id, not nr, so nr and
+        # exclude_disabled must be passed by keyword.
+        return self.get(name, label, nr=nr, exclude_disabled=exclude_disabled)
+
+    def toggle(self, name, by_label=False, nr=None):
+        """Deprecated: given a name or label and optional disambiguating index
+        nr, toggle the matching item's selection.
+
+        Selecting items follows the behavior described in the docstring of the
+        'get' method.
+
+        if the item is disabled, or this control is disabled or readonly,
+        raise AttributeError.
+
+        """
+        advice = "item = control.get(...); item.selected = not item.selected"
+        deprecation(advice)
+        item = self._get(name, by_label, nr)
+        self._set_selected_state(item, not item.selected)
+
+    def set(self, selected, name, by_label=False, nr=None):
+        """Deprecated: given a name or label and optional disambiguating index
+        nr, set the matching item's selection to the bool value of selected.
+
+        Selecting items follows the behavior described in the docstring of the
+        'get' method.
+
+        if the item is disabled, or this control is disabled or readonly,
+        raise AttributeError.
+
+        """
+        deprecation("control.get(...).selected = <boolean>")
+        item = self._get(name, by_label, nr)
+        self._set_selected_state(item, selected)
+
+    def _set_selected_state(self, item, action):
+        """Select (action true) or deselect (action false) item.
+
+        Raises AttributeError if this control is disabled or readonly, or if
+        item is disabled.  When the form's backwards_compat attribute is true,
+        a disabled item may still be deselected.
+
+        For single-selection controls, selecting an item deselects all of the
+        others.
+
+        """
+        if self.disabled:
+            raise AttributeError("control '%s' is disabled" % self.name)
+        if self.readonly:
+            raise AttributeError("control '%s' is readonly" % self.name)
+        # Normalise to a real bool so that Item.selected never hands back
+        # whatever arbitrary truthy/falsy object the caller passed in.
+        action = bool(action)
+        compat = self._form.backwards_compat
+        if item.disabled and (action or not compat):
+            raise AttributeError("item is disabled")
+        if action and not self.multiple:
+            # single selection: switching one item on switches the rest off
+            for o in self.items:
+                o.__dict__["_selected"] = False
+        item.__dict__["_selected"] = action
+
+    def toggle_single(self, by_label=None):
+        """Deprecated: toggle the selection of the single item in this control.
+
+        Raises ItemCountError if the control does not contain only one item.
+
+        by_label argument is ignored, and included only for backwards
+        compatibility.
+
+        """
+        deprecation(
+            "control.items[0].selected = not control.items[0].selected")
+        if len(self.items) != 1:
+            raise ItemCountError(
+                "'%s' is not a single-item control" % self.name)
+        only_item = self.items[0]
+        self._set_selected_state(only_item, not only_item.selected)
+
+    def set_single(self, selected, by_label=None):
+        """Deprecated: set the selection of the single item in this control.
+
+        Raises ItemCountError if the control does not contain only one item.
+
+        by_label argument is ignored, and included only for backwards
+        compatibility.
+
+        """
+        deprecation("control.items[0].selected = <boolean>")
+        if len(self.items) != 1:
+            raise ItemCountError(
+                "'%s' is not a single-item control" % self.name)
+        only_item = self.items[0]
+        self._set_selected_state(only_item, selected)
+
+    def get_item_disabled(self, name, by_label=False, nr=None):
+        """Get disabled state of named list item in a ListControl."""
+        deprecation("control.get(...).disabled")
+        item = self._get(name, by_label, nr)
+        return item.disabled
+
+    def set_item_disabled(self, disabled, name, by_label=False, nr=None):
+        """Set disabled state of named list item in a ListControl.
+
+        disabled: boolean disabled state
+
+        """
+        deprecation("control.get(...).disabled = <boolean>")
+        item = self._get(name, by_label, nr)
+        item.disabled = disabled
+
+    def set_all_items_disabled(self, disabled):
+        """Set disabled state of all list items in a ListControl.
+
+        disabled: boolean disabled state
+
+        """
+        for item in self.items:
+            item.disabled = disabled
+
+    def get_item_attrs(self, name, by_label=False, nr=None):
+        """Return dictionary of HTML attributes for a single ListControl item.
+
+        The HTML element types that describe list items are: OPTION for SELECT
+        controls, INPUT for the rest.  These elements have HTML attributes that
+        you may occasionally want to know about -- for example, the "alt" HTML
+        attribute gives a text string describing the item (graphical browsers
+        usually display this as a tooltip).
+
+        The returned dictionary maps HTML attribute names to values.  The names
+        and values are taken from the original HTML.
+
+        """
+        deprecation("control.get(...).attrs")
+        item = self._get(name, by_label, nr)
+        return item.attrs
+
+    def add_to_form(self, form):
+        """Attach this control to form.
+
+        If the form already holds a control with the same name and type, this
+        control is merged into it; otherwise this control is added as a new
+        control of the form.
+
+        """
+        assert self._form is None or form == self._form, (
+            "can't add control to more than one form")
+        self._form = form
+        try:
+            existing = form.find_control(self.name, self.type)
+        except ControlNotFoundError:
+            # first control of this name and type: it is the one that survives
+            Control.add_to_form(self, form)
+        else:
+            existing.merge_control(self)
+
+    def merge_control(self, control):
+        """Take over the items of control, which must agree on 'multiple'."""
+        assert bool(self.multiple) == bool(control.multiple)
+        #assert isinstance(control, self.__class__)
+        self.items.extend(control.items)
+
+    def fixup(self):
+        """
+        ListControls are built up from component list items (which are also
+        ListControls) during parsing.  This method should be called after all
+        items have been added.  See ListControl.__doc__ for the reason this is
+        required.
+
+        """
+        # Subclasses need to set a default selection where no item was
+        # indicated as being selected by the HTML:
+
+        # CHECKBOX:
+        #  Nothing should be selected.
+        # SELECT/single, SELECT/multiple and RADIO:
+        #  RFC 1866 (HTML 2.0): says first item should be selected.
+        #  W3C HTML 4.01 Specification: says that client behaviour is
+        #   undefined in this case.  For RADIO, exactly one must be selected,
+        #   though which one is undefined.
+        #  Both Netscape and Microsoft Internet Explorer (IE) choose first
+        #   item for SELECT/single.  However, both IE5 and Mozilla (both 1.0
+        #   and Firebird 0.6) leave all items unselected for RADIO and
+        #   SELECT/multiple.
+
+        # Since both Netscape and IE choose the first item for
+        # SELECT/single, we do the same.  OTOH, both Netscape and IE
+        # leave SELECT/multiple with nothing selected, in violation of RFC 1866
+        # (but not in violation of the W3C HTML 4 standard); the same is true
+        # of RADIO (which *is* in violation of the HTML 4 standard).  We follow
+        # RFC 1866 if the _select_default attribute is set, and Netscape and IE
+        # otherwise.  RFC 1866 and HTML 4 are always violated insofar as you
+        # can deselect all items in a RadioControl.
+
+        # Now that merging is done, point every item at the surviving
+        # control.
+        for item in self.items:
+            item.__dict__["_control"] = self
+
+    def __getattr__(self, name):
+        # "value" is computed on demand: the names of the selected items, in
+        # document order.  Disabled items are left out unless the form is in
+        # backwards_compat mode.
+        if name != "value":
+            raise AttributeError("%s instance has no attribute '%s'" %
+                                 (self.__class__.__name__, name))
+        compat = self._form.backwards_compat
+        return [item.name for item in self.items
+                if item.selected and (compat or not item.disabled)]
+
+    def __setattr__(self, name, value):
+        # name, type and multiple are read-only; value is validated and
+        # applied item by item; anything else is stored on the instance.
+        if name in ("name", "type", "multiple"):
+            raise AttributeError("%s attribute is readonly" % name)
+        if name != "value":
+            self.__dict__[name] = value
+            return
+        if self.disabled:
+            raise AttributeError("control '%s' is disabled" % self.name)
+        if self.readonly:
+            raise AttributeError("control '%s' is readonly" % self.name)
+        self._set_value(value)
+
+    def _set_value(self, value):
+        """Select exactly the items named in the sequence value.
+
+        Raises TypeError if value is None or string-like, and ItemCountError
+        if more than one name is given for a single-selection control.
+
+        """
+        if value is None or isstringlike(value):
+            raise TypeError("ListControl, must set a sequence")
+        if not value:
+            # empty sequence: deselect everything we are allowed to touch
+            compat = self._form.backwards_compat
+            for item in self.items:
+                if compat or not item.disabled:
+                    item.selected = False
+            return
+        if self.multiple:
+            self._multiple_set_value(value)
+            return
+        if len(value) > 1:
+            raise ItemCountError(
+                "single selection list, must set sequence of "
+                "length 0 or 1")
+        self._single_set_value(value)
+
+    def _get_items(self, name, target=1):
+        """Return (on, off): the selected and unselected enabled items called
+        name.
+
+        At least target such items must exist.  Raises ItemNotFoundError if
+        there are fewer than target items with that name at all, and
+        AttributeError if there are enough but too few are non-disabled.
+
+        """
+        candidates = self.get_items(name)
+        enabled = [item for item in candidates if not item.disabled]
+        if len(enabled) < target:
+            if len(candidates) < target:
+                raise ItemNotFoundError(
+                    "insufficient items with name %r" % name)
+            raise AttributeError(
+                "insufficient non-disabled items with name %s" % name)
+        on = [item for item in enabled if item.selected]
+        off = [item for item in enabled if not item.selected]
+        return on, off
+
+    def _single_set_value(self, value):
+        """Select the item named by the length-1 sequence value."""
+        assert len(value) == 1
+        selected, unselected = self._get_items(value[0])
+        assert len(selected) <= 1
+        # Nothing to do if an item of that name is already selected.
+        if not selected:
+            unselected[0].selected = True
+
+    def _multiple_set_value(self, value):
+        """Select the items named in value, deselecting all others.
+
+        A name that appears k times in value selects k items of that name.
+        All lookups are done before any item is changed, so that a failed
+        lookup leaves the selection untouched.
+
+        """
+        compat = self._form.backwards_compat
+        turn_on = []  # transactional-ish
+        turn_off = [item for item in self.items
+                    if item.selected and (compat or not item.disabled)]
+        # how many items of each name were asked for
+        wanted = {}
+        for nn in value:
+            wanted[nn] = wanted.get(nn, 0) + 1
+        for name, count in wanted.items():
+            on, off = self._get_items(name, count)
+            for i in range(count):
+                if on:
+                    # already selected: just make sure it stays that way
+                    item = on.pop(0)
+                    turn_off.remove(item)
+                else:
+                    item = off.pop(0)
+                    turn_on.append(item)
+        for item in turn_off:
+            item.selected = False
+        for item in turn_on:
+            item.selected = True
+
+    def set_value_by_label(self, value):
+        """Set the value of control by item labels.
+
+        value is expected to be an iterable of strings that are substrings of
+        the item labels that should be selected.  Before substring matching is
+        performed, the original label text is whitespace-compressed
+        (consecutive whitespace characters are converted to a single space
+        character) and leading and trailing whitespace is stripped.  Ambiguous
+        labels are accepted without complaint if the form's backwards_compat is
+        True; otherwise, it will not complain as long as all ambiguous labels
+        share the same item name (e.g. OPTION value).
+
+        """
+        if isstringlike(value):
+            raise TypeError(value)
+        if not self.multiple and len(value) > 1:
+            raise ItemCountError(
+                "single selection list, must set sequence of "
+                "length 0 or 1")
+        chosen = []
+        for text in value:
+            found = self.get_items(label=text)
+            if len(found) > 1:
+                if self._form.backwards_compat:
+                    # OK, we'll guess :-(  Assume first available item.
+                    found = found[:1]
+                else:
+                    # ambiguous labels are fine as long as item names (e.g.
+                    # OPTION values) are same
+                    first_name = found[0].name
+                    for other in found[1:]:
+                        if other.name != first_name:
+                            raise AmbiguityError(text)
+            picked = False
+            for item in found:
+                # For the multiple-item case, we could try to be smarter,
+                # saving them up and trying to resolve, but that's too much.
+                if self._form.backwards_compat or item not in chosen:
+                    chosen.append(item)
+                    picked = True
+                    break
+            if not picked:  # all of them are used
+                raise ItemNotFoundError(text)
+        # now we have all the items that should be on
+        # let's just turn everything off and then back on.
+        self.value = []
+        for item in chosen:
+            item.selected = True
+
+    def get_value_by_label(self):
+        """Return the value of the control as given by normalized labels."""
+        compat = self._form.backwards_compat
+        texts = []
+        for item in self.items:
+            if not item.selected:
+                continue
+            if item.disabled and not compat:
+                continue
+            # first non-empty label text, or None if the item has none
+            chosen = None
+            for label in item.get_labels():
+                if label.text:
+                    chosen = label.text
+                    break
+            texts.append(chosen)
+        return texts
+
+    def possible_items(self, by_label=False):
+        """Deprecated: return the names or labels of all possible items.
+
+        Includes disabled items, which may be misleading for some use cases.
+
+        """
+        deprecation("[item.name for item in self.items]")
+        if not by_label:
+            return [item.name for item in self.items]
+        texts = []
+        for item in self.items:
+            # first non-empty label text, or None if the item has none
+            chosen = None
+            for label in item.get_labels():
+                if label.text:
+                    chosen = label.text
+                    break
+            texts.append(chosen)
+        return texts
+
+    def _totally_ordered_pairs(self):
+        # One (index, control name, item name) triple per successful item;
+        # a disabled control contributes nothing.
+        if self.disabled:
+            return []
+        pairs = []
+        for item in self.items:
+            if item.selected and not item.disabled:
+                pairs.append((item._index, self.name, item.name))
+        return pairs
+
+    def __str__(self):
+        shown_name = self.name
+        if shown_name is None:
+            shown_name = "<None>"
+
+        rendered_items = ", ".join([str(item) for item in self.items])
+
+        flags = []
+        if self.disabled:
+            flags.append("disabled")
+        if self.readonly:
+            flags.append("readonly")
+        suffix = ", ".join(flags)
+        if suffix:
+            suffix = " (%s)" % suffix
+
+        return "<%s(%s=[%s])%s>" % (self.__class__.__name__,
+                                    shown_name, rendered_items, suffix)
+
+
+class RadioControl(ListControl):
+    """
+    Covers:
+
+    INPUT/RADIO
+
+    """
+    def __init__(self, type, name, attrs, select_default=False, index=None):
+        # A radio button without a value attribute gets the value "on".
+        attrs.setdefault("value", "on")
+        ListControl.__init__(self, type, name, attrs, select_default,
+                             called_as_base_class=True, index=index)
+        self.__dict__["multiple"] = False
+        # Each INPUT element starts life as a one-item control.
+        item = Item(self, attrs, index)
+        item.__dict__["_selected"] = attrs.has_key("checked")
+
+    def fixup(self):
+        ListControl.fixup(self)
+        checked = [item for item in self.items
+                   if item.selected and not item.disabled]
+        if checked:
+            # Ensure only one item selected.  Choose the last one,
+            # following IE and Firefox.
+            for item in checked[:-1]:
+                item.selected = False
+        elif self._select_default:
+            # Nothing checked: pick the first enabled item, if there is one.
+            for item in self.items:
+                if not item.disabled:
+                    item.selected = True
+                    break
+
+    def get_labels(self):
+        return []
+
+class CheckboxControl(ListControl):
+    """
+    Covers:
+
+    INPUT/CHECKBOX
+
+    """
+    def __init__(self, type, name, attrs, select_default=False, index=None):
+        # A checkbox without a value attribute gets the value "on".
+        attrs.setdefault("value", "on")
+        ListControl.__init__(self, type, name, attrs, select_default,
+                             called_as_base_class=True, index=index)
+        self.__dict__["multiple"] = True
+        # Each INPUT element starts life as a one-item control.
+        item = Item(self, attrs, index)
+        item.__dict__["_selected"] = attrs.has_key("checked")
+
+    def get_labels(self):
+        return []
+
+
+class SelectControl(ListControl):
+ """
+ Covers:
+
+ SELECT (and OPTION)
+
+ SELECT control values and labels are subject to some messy defaulting
+ rules. For example, if the HTML representation of the control is:
+
+ <SELECT name=year>
+ <OPTION value=0 label="2002">current year</OPTION>
+ <OPTION value=1>2001</OPTION>
+ <OPTION>2000</OPTION>
+ </SELECT>
+
+ The items, in order, have labels "2002", "2001" and "2000", whereas their
+ values are "0", "1" and "2000" respectively. Note that the value of the
+ last OPTION in this example defaults to its contents, as specified by RFC
+ 1866, as do the labels of the second and third OPTIONs.
+
+ The OPTION labels are sometimes more meaningful than the OPTION values,
+ which can make for more maintainable code.
+
+ Additional read-only public attribute: attrs
+
+ The attrs attribute is a dictionary of the original HTML attributes of the
+ SELECT element. Other ListControls do not have this attribute, because in
+ other cases the control as a whole does not correspond to any single HTML
+ element. The get_item_attrs method may be used as usual to get at the
+ HTML attributes of the HTML elements corresponding to individual list items
+ (for SELECT controls, these are OPTION elements).
+
+ Another special case is that the attributes dictionaries returned by
+ get_item_attrs have a special key "contents" which does not correspond to
+ any real HTML attribute, but rather contains the contents of the OPTION
+ element:
+
+ <OPTION>this bit</OPTION>
+
+ """
+ # HTML attributes here are treated slightly differently from other list
+ # controls:
+ # -The SELECT HTML attributes dictionary is stuffed into the OPTION
+ # HTML attributes dictionary under the "__select" key.
+ # -The content of each OPTION element is stored under the special
+ # "contents" key of the dictionary.
+ # After all this, the dictionary is passed to the SelectControl constructor
+ # as the attrs argument, as usual. However:
+ # -The first SelectControl constructed when building up a SELECT control
+ # has a constructor attrs argument containing only the __select key -- so
+ # this SelectControl represents an empty SELECT control.
+ # -Subsequent SelectControls have both OPTION HTML-attribute in attrs and
+ # the __select dictionary containing the SELECT HTML-attributes.
+
+    def __init__(self, type, name, attrs, select_default=False, index=None):
+        # fish out the SELECT HTML attributes from the OPTION HTML attributes
+        # dictionary
+        self.attrs = attrs["__select"].copy()
+        state = self.__dict__
+        state["_label"] = _getLabel(self.attrs)
+        state["id"] = self.attrs.get("id")
+        state["multiple"] = self.attrs.has_key("multiple")
+        # the majority of the contents, label, and value dance already happened
+        contents = attrs.get("contents")
+        # work on a copy so the caller's dictionary keeps its __select key
+        attrs = attrs.copy()
+        del attrs["__select"]
+
+        ListControl.__init__(self, type, name, self.attrs, select_default,
+                             called_as_base_class=True, index=index)
+        self.disabled = self.attrs.has_key("disabled")
+        self.readonly = self.attrs.has_key("readonly")
+        if not attrs.has_key("value"):
+            # it is a marker 'select started' token: no item to add
+            return
+        item = Item(self, attrs, index)
+        item.__dict__["_selected"] = attrs.has_key("selected")
+        # add 'label' label and contents label, if different.  If both are
+        # provided, the 'label' label is used for display in HTML
+        # 4.0-compliant browsers (and any lower spec? not sure) while the
+        # contents are used for display in older or less-compliant
+        # browsers.  We make label objects for both, if the values are
+        # different.
+        label = attrs.get("label")
+        label_texts = []
+        if label:
+            label_texts.append(label)
+        if contents and contents != label:
+            label_texts.append(contents)
+        for text in label_texts:
+            item._labels.append(Label({"__text": text}))
+
+    def fixup(self):
+        ListControl.fixup(self)
+        # Firefox doesn't exclude disabled items from those considered here
+        # (i.e. from 'found', for both branches of the if below).  Note that
+        # IE doesn't support the disabled attribute on OPTIONs at all.
+        found = [item for item in self.items if item.selected]
+        if found:
+            if not self.multiple:
+                # Ensure only one item selected.  Choose the last one,
+                # following IE and Firefox.
+                for item in found[:-1]:
+                    item.selected = False
+        elif not self.multiple or self._select_default:
+            # Nothing selected in the HTML: default to the first enabled
+            # item, if there is one.
+            for item in self.items:
+                if not item.disabled:
+                    item.selected = True
+                    break
+
+
+#---------------------------------------------------
+class SubmitControl(ScalarControl):
+    """
+    Covers:
+
+    INPUT/SUBMIT
+    BUTTON/SUBMIT
+
+    A submit control only contributes its name/value pair to the form data
+    while it is being clicked.
+
+    """
+    def __init__(self, type, name, attrs, index=None):
+        ScalarControl.__init__(self, type, name, attrs, index)
+        # IE5 defaults SUBMIT value to "Submit Query"; Firebird 0.6 leaves it
+        # blank, Konqueror 3.1 defaults to "Submit".  HTML spec. doesn't seem
+        # to define this.
+        if self.value is None: self.value = ""
+        # must come after the value default above has been assigned
+        self.readonly = True
+
+    def get_labels(self):
+        """Return the control's labels, with its own value (if any) first."""
+        res = []
+        if self.value:
+            res.append(Label({"__text": self.value}))
+        res.extend(ScalarControl.get_labels(self))
+        return res
+
+    def is_of_kind(self, kind): return kind == "clickable"
+
+    def _click(self, form, coord, return_type, request_class=urllib2.Request):
+        """Return the result of submitting form by clicking this control.
+
+        coord is remembered in self._clicked while the form data is being
+        assembled, and is always reset afterwards.
+
+        """
+        self._clicked = coord
+        try:
+            return form._switch_click(return_type, request_class)
+        finally:
+            # Reset even if building the request fails; otherwise this
+            # control would stay "clicked" and leak into later submissions.
+            self._clicked = False
+
+    def _totally_ordered_pairs(self):
+        # Unsuccessful unless it is the control being clicked.
+        if not self._clicked:
+            return []
+        return ScalarControl._totally_ordered_pairs(self)
+
+
+#---------------------------------------------------
+class ImageControl(SubmitControl):
+    """
+    Covers:
+
+    INPUT/IMAGE
+
+    Coordinates are specified using one of the HTMLForm.click* methods.
+
+    """
+    def __init__(self, type, name, attrs, index=None):
+        SubmitControl.__init__(self, type, name, attrs, index)
+        # unlike plain submit controls, image controls are not read-only
+        self.readonly = False
+
+    def _totally_ordered_pairs(self):
+        coord = self._clicked
+        if self.disabled or not coord:
+            return []
+        name = self.name
+        if name is None:
+            return []
+        # the click position is submitted as name.x / name.y
+        pairs = []
+        pairs.append((self._index, "%s.x" % name, str(coord[0])))
+        pairs.append((self._index, "%s.y" % name, str(coord[1])))
+        value = self._value
+        if value:
+            pairs.append((self._index, name, value))
+        return pairs
+
+    # Use the plain ScalarControl labels, not SubmitControl's value label.
+    get_labels = ScalarControl.get_labels
+
+# aliases, just to make str(control) and str(form) clearer
+class PasswordControl(TextControl):
+    pass
+
+class HiddenControl(TextControl):
+    pass
+
+class TextareaControl(TextControl):
+    pass
+
+class SubmitButtonControl(SubmitControl):
+    pass
+
+
+def is_listcontrol(control):
+    """Return the result of asking control whether it is of kind "list"."""
+    return control.is_of_kind("list")
+
+
+class HTMLForm:
+ """Represents a single HTML <form> ... </form> element.
+
+ A form consists of a sequence of controls that usually have names, and
+ which can take on various values. The values of the various types of
+ controls represent variously: text, zero-or-one-of-many or many-of-many
+ choices, and files to be uploaded. Some controls can be clicked on to
+ submit the form, and clickable controls' values sometimes include the
+ coordinates of the click.
+
+ Forms can be filled in with data to be returned to the server, and then
+ submitted, using the click method to generate a request object suitable for
+ passing to urllib2.urlopen (or the click_request_data or click_pairs
+ methods if you're not using urllib2).
+
+ import ClientForm
+ forms = ClientForm.ParseFile(html, base_uri)
+ form = forms[0]
+
+ form["query"] = "Python"
+ form.find_control("nr_results").get("lots").selected = True
+
+ response = urllib2.urlopen(form.click())
+
+ Usually, HTMLForm instances are not created directly. Instead, the
+ ParseFile or ParseResponse factory functions are used. If you do construct
+ HTMLForm objects yourself, however, note that an HTMLForm instance is only
+ properly initialised after the fixup method has been called (ParseFile and
+ ParseResponse do this for you). See ListControl.__doc__ for the reason
+ this is required.
+
+ Indexing a form (form["control_name"]) returns the named Control's value
+ attribute. Assignment to a form index (form["control_name"] = something)
+ is equivalent to assignment to the named Control's value attribute. If you
+ need to be more specific than just supplying the control's name, use the
+ set_value and get_value methods.
+
+ ListControl values are lists of item names. The list item's name is the
+ value of the corresponding HTML element's "value" attribute.
+
+ Example:
+
+ <INPUT type="CHECKBOX" name="cheeses" value="leicester"></INPUT>
+ <INPUT type="CHECKBOX" name="cheeses" value="cheddar"></INPUT>
+
+ defines a CHECKBOX control with name "cheeses" which has two items, named
+ "leicester" and "cheddar".
+
+ Another example:
+
+ <SELECT name="more_cheeses">
+ <OPTION>1</OPTION>
+ <OPTION value="2" label="CHEDDAR">cheddar</OPTION>
+ </SELECT>
+
+ defines a SELECT control with name "more_cheeses" which has two items,
+ named "1" and "2" (because the OPTION element's value HTML attribute
+ defaults to the element contents).
+
+ To select, deselect or otherwise manipulate individual list items, use the
+ HTMLForm.find_control() and ListControl.get() methods. To set the whole
+ value, do as for any other control: use indexing or the set_/get_value
+ methods.
+
+ Example:
+
+ # select *only* the item named "cheddar"
+ form["cheeses"] = ["cheddar"]
+ # select "cheddar", leave other items unaffected
+ form.find_control("cheeses").get("cheddar").selected = True
+
+ Some controls (RADIO and SELECT without the multiple attribute) can only
+ have zero or one items selected at a time. Some controls (CHECKBOX and
+ SELECT with the multiple attribute) can have multiple items selected at a
+ time. To set the whole value of a ListControl, assign a sequence to a form
+ index:
+
+ form["cheeses"] = ["cheddar", "leicester"]
+
+ If the ListControl is not multiple-selection, the assigned list must be of
+ length one.
+
+ To check if a control has an item, if an item is selected, or if an item is
+ successful (selected and not disabled), respectively:
+
+ "cheddar" in [item.name for item in form.find_control("cheeses").items]
+ "cheddar" in [item.name for item in form.find_control("cheeses").items if
+ item.selected]
+ "cheddar" in form["cheeses"] # (or "cheddar" in form.get_value("cheeses"))
+
+ Note that some list items may be disabled (see below).
+
+ Note the following mistake:
+
+ form[control_name] = control_value
+ assert form[control_name] == control_value # not necessarily true
+
+ The reason for this is that form[control_name] always gives the list items
+ in the order they were listed in the HTML.
+
+ List items (hence list values, too) can be referred to in terms of list
+ item labels rather than list item names using the appropriate label
+ arguments. Note that each item may have several labels.
+
+ The question of default values of OPTION contents, labels and values is
+ somewhat complicated: see SelectControl.__doc__ and
+ ListControl.get_item_attrs.__doc__ if you think you need to know.
+
+ Controls can be disabled or readonly. In either case, the control's value
+ cannot be changed until you clear those flags (see example below).
+ Disabled is the state typically represented by browsers by 'greying out' a
+ control. Disabled controls are not 'successful' -- they don't cause data
+ to get returned to the server. Readonly controls usually appear in
+ browsers as read-only text boxes. Readonly controls are successful. List
+ items can also be disabled. Attempts to select or deselect disabled items
+ fail with AttributeError.
+
+ If a lot of controls are readonly, it can be useful to do this:
+
+ form.set_all_readonly(False)
+
+ To clear a control's value attribute, so that it is not successful (until a
+ value is subsequently set):
+
+ form.clear("cheeses")
+
+ More examples:
+
+ control = form.find_control("cheeses")
+ control.disabled = False
+ control.readonly = False
+ control.get("gruyere").disabled = True
+ control.items[0].selected = True
+
+ See the various Control classes for further documentation. Many methods
+ take name, type, kind, id, label and nr arguments to specify the control to
+ be operated on: see HTMLForm.find_control.__doc__.
+
+ ControlNotFoundError (subclass of ValueError) is raised if the specified
+ control can't be found. This includes occasions where a non-ListControl
+ is found, but the method (set, for example) requires a ListControl.
+ ItemNotFoundError (subclass of ValueError) is raised if a list item can't
+ be found. ItemCountError (subclass of ValueError) is raised if an attempt
+ is made to select more than one item and the control doesn't allow that, or
+ set/get_single are called and the control contains more than one item.
+ AttributeError is raised if a control or item is readonly or disabled and
+ an attempt is made to alter its value.
+
+ Security note: Remember that any passwords you store in HTMLForm instances
+ will be saved to disk in the clear if you pickle them (directly or
+ indirectly). The simplest solution to this is to avoid pickling HTMLForm
+ objects. You could also pickle before filling in any password, or just set
+ the password to "" before pickling.
+
+
+ Public attributes:
+
+ action: full (absolute URI) form action
+ method: "GET" or "POST"
+ enctype: form transfer encoding MIME type
+ name: name of form (None if no name was specified)
+ attrs: dictionary mapping original HTML form attributes to their values
+
+ controls: list of Control instances; do not alter this list
+ (instead, call form.new_control to make a Control and add it to the
+ form, or control.add_to_form if you already have a Control instance)
+
+
+
+ Methods for form filling:
+ -------------------------
+
+ Most of these methods have very similar arguments. See
+ HTMLForm.find_control.__doc__ for details of the name, type, kind, label
+ and nr arguments.
+
+ def find_control(self,
+ name=None, type=None, kind=None, id=None, predicate=None,
+ nr=None, label=None)
+
+ get_value(name=None, type=None, kind=None, id=None, nr=None,
+ by_label=False, # by_label is deprecated
+ label=None)
+ set_value(value,
+ name=None, type=None, kind=None, id=None, nr=None,
+ by_label=False, # by_label is deprecated
+ label=None)
+
+ clear_all()
+ clear(name=None, type=None, kind=None, id=None, nr=None, label=None)
+
+ set_all_readonly(readonly)
+
+
+ Method applying only to FileControls:
+
+ add_file(file_object,
+ content_type="application/octet-stream", filename=None,
+ name=None, id=None, nr=None, label=None)
+
+
+ Methods applying only to clickable controls:
+
+ click(name=None, type=None, id=None, nr=0, coord=(1,1), label=None)
+ click_request_data(name=None, type=None, id=None, nr=0, coord=(1,1),
+ label=None)
+ click_pairs(name=None, type=None, id=None, nr=0, coord=(1,1), label=None)
+
+ """
+
+ # Maps a lower-cased control type string to the Control class that
+ # new_control instantiates for it. Types missing from this table fall
+ # back to TextControl or IgnoreControl (see new_control).
+ type2class = {
+ "text": TextControl,
+ "password": PasswordControl,
+ "hidden": HiddenControl,
+ "textarea": TextareaControl,
+
+ "isindex": IsindexControl,
+
+ "file": FileControl,
+
+ "button": IgnoreControl,
+ "buttonbutton": IgnoreControl,
+ "reset": IgnoreControl,
+ "resetbutton": IgnoreControl,
+
+ "submit": SubmitControl,
+ "submitbutton": SubmitButtonControl,
+ "image": ImageControl,
+
+ "radio": RadioControl,
+ "checkbox": CheckboxControl,
+ "select": SelectControl,
+ }
+
+#---------------------------------------------------
+# Initialisation. Use ParseResponse / ParseFile instead.
+
+    def __init__(self, action, method="GET",
+                 enctype="application/x-www-form-urlencoded",
+                 name=None, attrs=None,
+                 request_class=urllib2.Request,
+                 forms=None, labels=None, id_to_labels=None,
+                 backwards_compat=True):
+        """Initialise a form that has no controls yet.
+
+        In the usual case, use ParseResponse (or ParseFile) to create new
+        HTMLForm objects rather than calling this directly.
+
+        action: full (absolute URI) form action
+        method: "GET" or "POST"
+        enctype: form transfer encoding MIME type
+        name: name of form
+        attrs: dictionary mapping original HTML form attributes to their
+         values; a copy is stored, so the caller's dictionary is not shared
+        request_class: class used to build request objects when the form is
+         clicked
+        forms, labels, id_to_labels: stored unchanged on the instance (as
+         _forms, _labels and _id_to_labels) for use by zope.testbrowser
+        backwards_compat: stored as a boolean; see __setattr__
+
+        """
+        # The control list must exist before backwards_compat is assigned
+        # below: __setattr__ walks self.controls for that attribute.
+        self.controls = []
+
+        self.action = action
+        self.method = method
+        self.enctype = enctype
+        self.name = name
+        if attrs is None:
+            self.attrs = {}
+        else:
+            self.attrs = attrs.copy()
+        self._request_class = request_class
+
+        # These attributes are used by zope.testbrowser: all three are a
+        # semi-public API!
+        self._forms = forms
+        self._labels = labels
+        self._id_to_labels = id_to_labels
+
+        self.backwards_compat = backwards_compat  # note __setattr__
+
+    def __getattr__(self, name):
+        """Supply attributes that ordinary instance lookup did not find.
+
+        "backwards_compat" is served from the _backwards_compat value stored
+        by __setattr__; any other name is looked up on the HTMLForm class.
+
+        """
+        if name != "backwards_compat":
+            return getattr(HTMLForm, name)
+        return self._backwards_compat
+
+    def __setattr__(self, name, value):
+        """Store an attribute, giving "backwards_compat" special treatment.
+
+        Assigning backwards_compat coerces the value to a boolean, copies it
+        onto the _backwards_compat attribute of every label of every item of
+        every control that has an items attribute, and stores it on the form
+        under the name _backwards_compat.  All other names are stored
+        directly in the instance dictionary.
+
+        """
+        # yuck
+        if name == "backwards_compat":
+            value = bool(value)
+            for control in self.controls:
+                try:
+                    items = control.items
+                except AttributeError:
+                    # no items attribute, so no item labels to update
+                    continue
+                for item in items:
+                    for item_label in item.get_labels():
+                        item_label._backwards_compat = value
+            name = "_backwards_compat"
+        self.__dict__[name] = value
+
+    def new_control(self, type, name, attrs,
+                    ignore_unknown=False, select_default=False, index=None):
+        """Adds a new control to the form.
+
+        This is usually called by ParseFile and ParseResponse.  Don't call it
+        yourself unless you're building your own Control instances.
+
+        Note that controls representing lists of items are built up from
+        controls holding only a single list item.  See ListControl.__doc__ for
+        further information.
+
+        type: type of control (see Control.__doc__ for a list); it is
+         lower-cased before being looked up in type2class
+        name: passed to the control class as the control's name
+        attrs: HTML attributes of control (a copy is handed to the control)
+        ignore_unknown: if true, use a dummy Control instance (IgnoreControl)
+         for controls of unknown type; otherwise, use a TextControl
+        select_default: for RADIO and multiple-selection SELECT controls, pick
+         the first item as the default if no 'selected' HTML attribute is
+         present (this defaulting happens when the HTMLForm.fixup method is
+         called); only passed on to ListControl subclasses
+        index: index of corresponding element in HTML (see
+         MoreFormTests.test_interspersed_controls for motivation)
+
+        """
+        type = type.lower()
+
+        # Class to use when the type is not in the table.
+        if ignore_unknown:
+            fallback = IgnoreControl
+        else:
+            fallback = TextControl
+        klass = self.type2class.get(type)
+        if klass is None:
+            klass = fallback
+
+        attrs_copy = attrs.copy()
+        if issubclass(klass, ListControl):
+            control = klass(type, name, attrs_copy, select_default, index)
+        else:
+            control = klass(type, name, attrs_copy, index)
+        control.add_to_form(self)
+
+    def fixup(self):
+        """Normalise form after all controls have been added.
+
+        This is usually called by ParseFile and ParseResponse.  Don't call it
+        yourself unless you're building your own Control instances.
+
+        This method should only be called once, after all controls have been
+        added to the form.
+
+        """
+        for cc in self.controls:
+            cc.fixup()
+        # Re-assigning the flag makes __setattr__ copy it onto the labels of
+        # the list items that the controls now hold.
+        compat = self._backwards_compat
+        self.backwards_compat = compat
+
+#---------------------------------------------------
+    def __str__(self):
+        """Return "<METHOD ACTION ENCTYPE>" with one line per control inside.
+
+        The first line holds the method, action and enctype; each control
+        follows on its own line, rendered with str().
+
+        """
+        lines = ["%s %s %s" % (self.method, self.action, self.enctype)]
+        lines.extend([" %s" % str(cc) for cc in self.controls])
+        return "<%s>" % "\n".join(lines)
+
+#---------------------------------------------------
+# Form-filling methods.
+
+    def __getitem__(self, name):
+        """Return the value of the control that find_control(name) locates."""
+        control = self.find_control(name)
+        return control.value
+    def __contains__(self, name):
+        """Return whether the form has a control with the given name.
+
+        A membership test must answer False for a missing control rather than
+        let ControlNotFoundError escape from find_control, so that
+        "name in form" can be used to guard form[name].  Other errors raised
+        by find_control (for example for a name that is None or not
+        string-like, or for an ambiguous match) still propagate.
+
+        """
+        try:
+            self.find_control(name)
+        except ControlNotFoundError:
+            return False
+        return True
+    def __setitem__(self, name, value):
+        """Set the value of the control that find_control(name) locates.
+
+        An AttributeError raised by the assignment is re-raised as a
+        ValueError carrying the same message.
+
+        """
+        target = self.find_control(name)
+        try:
+            target.value = value
+        except AttributeError, err:
+            raise ValueError(str(err))
+
+    def get_value(self,
+                  name=None, type=None, kind=None, id=None, nr=None,
+                  by_label=False,  # by_label is deprecated
+                  label=None):
+        """Return value of control.
+
+        If only the name argument is supplied, equivalent to
+
+            form[name]
+
+        The control is located with find_control.  by_label is deprecated
+        (the deprecation helper is called when it is true): use
+        get_value_by_label instead.  With by_label true, the result comes
+        from the control's get_value_by_label method, and
+        NotImplementedError is raised if the control has no such method.
+
+        """
+        if by_label:
+            deprecation("form.get_value_by_label(...)")
+        control = self.find_control(name, type, kind, id, label=label, nr=nr)
+        if not by_label:
+            return control.value
+        try:
+            getter = control.get_value_by_label
+        except AttributeError:
+            raise NotImplementedError(
+                "control '%s' does not yet support by_label" % control.name)
+        return getter()
+    def set_value(self, value,
+                  name=None, type=None, kind=None, id=None, nr=None,
+                  by_label=False,  # by_label is deprecated
+                  label=None):
+        """Set value of control.
+
+        If only name and value arguments are supplied, equivalent to
+
+            form[name] = value
+
+        The control is located with find_control.  by_label is deprecated
+        (the deprecation helper is called when it is true): use
+        set_value_by_label instead.  With by_label true, the value is handed
+        to the control's set_value_by_label method, and NotImplementedError
+        is raised if the control has no such method.
+
+        """
+        if by_label:
+            # Name the setter as the replacement: this is the set_ method,
+            # so pointing callers at get_value_by_label would mislead them.
+            deprecation("form.set_value_by_label(...)")
+        c = self.find_control(name, type, kind, id, label=label, nr=nr)
+        if by_label:
+            try:
+                meth = c.set_value_by_label
+            except AttributeError:
+                raise NotImplementedError(
+                    "control '%s' does not yet support by_label" % c.name)
+            else:
+                meth(value)
+        else:
+            c.value = value
+    def get_value_by_label(
+        self, name=None, type=None, kind=None, id=None, label=None, nr=None):
+        """Return the located control's get_value_by_label() result.
+
+        The control is located with find_control.  All arguments should be
+        passed by name.
+
+        """
+        control = self.find_control(name, type, kind, id, label=label, nr=nr)
+        by_label_value = control.get_value_by_label()
+        return by_label_value
+
+    def set_value_by_label(
+        self, value,
+        name=None, type=None, kind=None, id=None, label=None, nr=None):
+        """Pass value to the located control's set_value_by_label method.
+
+        The control is located with find_control.  All arguments should be
+        passed by name.
+
+        """
+        control = self.find_control(name, type, kind, id, label=label, nr=nr)
+        setter = control.set_value_by_label
+        setter(value)
+
+    def set_all_readonly(self, readonly):
+        """Set the readonly attribute of every control to bool(readonly)."""
+        for cc in self.controls:
+            cc.readonly = bool(readonly)
+
+    def clear_all(self):
+        """Clear the value attributes of all controls in the form.
+
+        Calls clear() on each control in turn.  See HTMLForm.clear.__doc__.
+
+        """
+        for cc in self.controls:
+            cc.clear()
+
+    def clear(self,
+              name=None, type=None, kind=None, id=None, nr=None, label=None):
+        """Clear the value attribute of a control.
+
+        The control is located with find_control.  As a result, the affected
+        control will not be successful until a value is subsequently set.
+        AttributeError is raised on readonly controls.
+
+        """
+        target = self.find_control(name, type, kind, id, label=label, nr=nr)
+        target.clear()
+
+
+#---------------------------------------------------
+# Form-filling methods applying only to ListControls.
+
+    def possible_items(self,  # deprecated
+                       name=None, type=None, kind=None, id=None,
+                       nr=None, by_label=False, label=None):
+        """Return a list of all values that the specified control can take.
+
+        The list control is located with _find_list_control; by_label is
+        passed on to its possible_items method.
+
+        """
+        list_control = self._find_list_control(
+            name, type, kind, id, label, nr)
+        return list_control.possible_items(by_label)
+
+    def set(self, selected, item_name,  # deprecated
+            name=None, type=None, kind=None, id=None, nr=None,
+            by_label=False, label=None):
+        """Select / deselect named list item.
+
+        selected: boolean selected state
+        item_name, by_label: passed on to the list control's set method
+
+        The list control is located with _find_list_control.
+
+        """
+        list_control = self._find_list_control(
+            name, type, kind, id, label, nr)
+        list_control.set(selected, item_name, by_label)
+    def toggle(self, item_name,  # deprecated
+               name=None, type=None, kind=None, id=None, nr=None,
+               by_label=False, label=None):
+        """Toggle selected state of named list item.
+
+        The list control is located with _find_list_control; item_name and
+        by_label are passed on to its toggle method.
+
+        """
+        list_control = self._find_list_control(
+            name, type, kind, id, label, nr)
+        list_control.toggle(item_name, by_label)
+
+    def set_single(self, selected,  # deprecated
+                   name=None, type=None, kind=None, id=None,
+                   nr=None, by_label=None, label=None):
+        """Select / deselect list item in a control having only one item.
+
+        If the control has multiple list items, ItemCountError is raised.
+
+        This is just a convenience method, so you don't need to know the item's
+        name -- the item name in these single-item controls is usually
+        something meaningless like "1" or "on".
+
+        For example, if a checkbox has a single item named "on", the following
+        two calls are equivalent:
+
+            control.toggle("on")
+            control.toggle_single()
+
+        by_label is ignored and deprecated.
+
+        """
+        list_control = self._find_list_control(
+            name, type, kind, id, label, nr)
+        list_control.set_single(selected)
+    def toggle_single(self, name=None, type=None, kind=None, id=None,
+                      nr=None, by_label=None, label=None):  # deprecated
+        """Toggle selected state of list item in control having only one item.
+
+        The rest is as for HTMLForm.set_single.__doc__.  by_label is ignored
+        and deprecated.
+
+        """
+        list_control = self._find_list_control(
+            name, type, kind, id, label, nr)
+        list_control.toggle_single()
+
+#---------------------------------------------------
+# Form-filling method applying only to FileControls.
+
+    def add_file(self, file_object, content_type=None, filename=None,
+                 name=None, id=None, nr=None, label=None):
+        """Add a file to be uploaded.
+
+        The control is located with find_control, restricted to type "file";
+        the three file arguments are passed on to its add_file method.
+
+        file_object: file-like object (with read method) from which to read
+         data to upload
+        content_type: MIME content type of data to upload
+        filename: filename to pass to server
+
+        If filename is None, no filename is sent to the server.
+
+        If content_type is None, the content type is guessed based on the
+        filename and the data read from the file object.
+
+        XXX
+        At the moment, guessed content type is always application/octet-stream.
+        Use sndhdr, imghdr modules.  Should also try to guess HTML, XML, and
+        plain text.
+
+        Note the following useful HTML attributes of file upload controls (see
+        HTML 4.01 spec, section 17):
+
+        accept: comma-separated list of content types that the server will
+         handle correctly; you can use this to filter out non-conforming files
+        size: XXX IIRC, this is indicative of whether form wants multiple or
+         single files
+        maxlength: XXX hint of max content length in bytes?
+
+        """
+        file_control = self.find_control(
+            name, "file", id=id, label=label, nr=nr)
+        file_control.add_file(file_object, content_type, filename)
+
+#---------------------------------------------------
+# Form submission methods, applying only to clickable controls.
+
+    def click(self, name=None, type=None, id=None, nr=0, coord=(1,1),
+              request_class=urllib2.Request,
+              label=None):
+        """Return request that would result from clicking on a control.
+
+        The request object is built with the request class that was given to
+        the HTMLForm constructor (urllib2.Request by default); you can pass
+        it to urllib2.urlopen (or ClientCookie.urlopen).  Note that the
+        request_class argument of this method is accepted but not used.
+
+        Only some control types (INPUT/SUBMIT & BUTTON/SUBMIT buttons and
+        IMAGEs) can be clicked.
+
+        Will click on the first clickable control, subject to the name, type,
+        id, label and nr arguments (as for find_control).  If no name, type,
+        id or number is specified and there are no clickable controls, a
+        request will be returned for the form in its current, un-clicked,
+        state.
+
+        ControlNotFoundError is raised if any of name, type, id or nr is
+        specified but no matching control is found.  ValueError is raised if
+        the HTMLForm has an enctype attribute that is not recognised.
+
+        You can optionally specify a coordinate to click at, which only makes a
+        difference if you clicked on an image.
+
+        """
+        return_type = "request"
+        return self._click(name, type, id, label, nr, coord, return_type,
+                           self._request_class)
+
+    def click_request_data(self,
+                           name=None, type=None, id=None,
+                           nr=0, coord=(1,1),
+                           request_class=urllib2.Request,
+                           label=None):
+        """As for click method, but return a tuple (url, data, headers).
+
+        You can use this data to send a request to the server.  This is useful
+        if you're using httplib or urllib rather than urllib2.  Otherwise, use
+        the click method.  As with click, the request_class argument is
+        accepted but not used.
+
+        # Untested.  Have to subclass to add headers, I think -- so use urllib2
+        # instead!
+        import urllib
+        url, data, hdrs = form.click_request_data()
+        r = urllib.urlopen(url, data)
+
+        # Untested.  I don't know of any reason to use httplib -- you can get
+        # just as much control with urllib2.
+        import httplib, urlparse
+        url, data, hdrs = form.click_request_data()
+        tup = urlparse.urlparse(url)
+        host, path = tup[1], urlparse.urlunparse(("", "") + tup[2:])
+        conn = httplib.HTTPConnection(host)
+        if data:
+            conn.request("POST", path, data, dict(hdrs))
+        else:
+            conn.request("GET", path, headers=dict(hdrs))
+        r = conn.getresponse()
+
+        """
+        return_type = "request_data"
+        return self._click(name, type, id, label, nr, coord, return_type,
+                           self._request_class)
+
+    def click_pairs(self, name=None, type=None, id=None,
+                    nr=0, coord=(1,1),
+                    label=None):
+        """As for click_request_data, but returns a list of (key, value) pairs.
+
+        You can use this list as an argument to ClientForm.urlencode.  This is
+        usually only useful if you're using httplib or urllib rather than
+        urllib2 or ClientCookie.  It may also be useful if you want to manually
+        tweak the keys and/or values, but this should not be necessary.
+        Otherwise, use the click method.
+
+        Note that this method is only useful for forms of MIME type
+        x-www-form-urlencoded.  In particular, it does not return the
+        information required for file upload.  If you need file upload and are
+        not using urllib2, use click_request_data.
+
+        Also note that Python 2.0's urllib.urlencode is slightly broken: it
+        only accepts a mapping, not a sequence of pairs, as an argument.  This
+        messes up any ordering in the argument.  Use ClientForm.urlencode
+        instead.
+
+        """
+        return_type = "pairs"
+        return self._click(name, type, id, label, nr, coord, return_type,
+                           self._request_class)
+
+#---------------------------------------------------
+
+    def find_control(self,
+                     name=None, type=None, kind=None, id=None,
+                     predicate=None, nr=None,
+                     label=None):
+        """Locate and return some specific control within the form.
+
+        At least one of the name, type, kind, id, label, predicate and nr
+        arguments must be supplied (ValueError is raised otherwise).  If no
+        matching control is found, ControlNotFoundError is raised.
+
+        If name is specified, then the control must have the indicated name.
+
+        If type is specified then the control must have the specified type (in
+        addition to the types possible for <input> HTML tags: "text",
+        "password", "hidden", "submit", "image", "button", "radio", "checkbox",
+        "file" we also have "reset", "buttonbutton", "submitbutton",
+        "resetbutton", "textarea", "select" and "isindex").
+
+        If kind is specified, then the control must fall into the specified
+        group, each of which satisfies a particular interface.  The kinds are
+        "text", "list", "multilist", "singlelist", "clickable" and "file".
+
+        If id is specified, then the control must have the indicated id.
+
+        If predicate is specified, then the control must match that function.
+        The predicate function is passed the control as its single argument,
+        and should return a boolean value indicating whether the control
+        matched.
+
+        nr, if supplied, is the sequence number of the control (where 0 is the
+        first).  Note that control 0 is the first control matching all the
+        other arguments (if supplied); it is not necessarily the first control
+        in the form.  If no nr is supplied, AmbiguityError is raised if
+        multiple controls match the other arguments (unless the
+        .backwards_compat attribute is true).
+
+        If label is specified, then the control must have this label.  Note
+        that radio controls and checkboxes never have labels: their items do.
+
+        """
+        for criterion in (name, type, kind, id, label, predicate, nr):
+            if criterion is not None:
+                break
+        else:
+            # every criterion was None: nothing to search by
+            raise ValueError(
+                "at least one argument must be supplied to specify control")
+        return self._find_control(name, type, kind, id, label, predicate, nr)
+
+#---------------------------------------------------
+# Private methods.
+
+    def _find_list_control(self,
+                           name=None, type=None, kind=None, id=None,
+                           label=None, nr=None):
+        """Locate a control as _find_control does, accepting only list controls.
+
+        is_listcontrol is used as the predicate.  ValueError is raised if
+        none of the arguments is supplied.
+
+        """
+        for criterion in (name, type, kind, id, label, nr):
+            if criterion is not None:
+                break
+        else:
+            # every criterion was None: nothing to search by
+            raise ValueError(
+                "at least one argument must be supplied to specify control")
+
+        return self._find_control(name, type, kind, id, label,
+                                  is_listcontrol, nr)
+
+    def _find_control(self, name, type, kind, id, label, predicate, nr):
+        """Return the control matching every criterion that is not None.
+
+        name, type, id: compared for equality with the control's attribute
+        kind: the control must answer true to is_of_kind(kind)
+        label: must occur as a substring of the text of one of the control's
+         labels (an empty label is not checked)
+        predicate: callable that must return true for the control
+        nr: sequence number among the matching controls (0 is the first).
+         When nr is None and self.backwards_compat is true, the first match
+         is returned; when nr is None otherwise, exactly one control must
+         match.
+
+        Raises TypeError for a criterion of the wrong type, ValueError for a
+        negative nr, AmbiguityError if more than one control matches and no
+        nr disambiguates, and ControlNotFoundError if nothing matches.
+
+        """
+        if (name is not None) and not isstringlike(name):
+            raise TypeError("control name must be string-like")
+        if (type is not None) and not isstringlike(type):
+            raise TypeError("control type must be string-like")
+        if (kind is not None) and not isstringlike(kind):
+            raise TypeError("control kind must be string-like")
+        if (id is not None) and not isstringlike(id):
+            raise TypeError("control id must be string-like")
+        if (label is not None) and not isstringlike(label):
+            raise TypeError("control label must be string-like")
+        if (predicate is not None) and not callable(predicate):
+            raise TypeError("control predicate must be callable")
+        if (nr is not None) and nr < 0:
+            # 0 is a valid sequence number, so "positive" would be wrong
+            raise ValueError("control number must be a non-negative integer")
+
+        orig_nr = nr
+        found = None
+        ambiguous = False
+        if nr is None and self.backwards_compat:
+            nr = 0
+
+        for control in self.controls:
+            if name is not None and name != control.name:
+                continue
+            if type is not None and type != control.type:
+                continue
+            if kind is not None and not control.is_of_kind(kind):
+                continue
+            if id is not None and id != control.id:
+                continue
+            if predicate and not predicate(control):
+                continue
+            if label:
+                for l in control.get_labels():
+                    if l.text.find(label) > -1:
+                        break
+                else:
+                    continue
+            if nr is not None:
+                if nr == 0:
+                    return control  # early exit: unambiguous due to nr
+                nr -= 1
+                continue
+            # Compare with None rather than by truth value, so that the
+            # result does not depend on how a control object evaluates in a
+            # boolean context.
+            if found is not None:
+                ambiguous = True
+                break
+            found = control
+
+        if found is not None and not ambiguous:
+            return found
+
+        description = []
+        if name is not None: description.append("name '%s'" % name)
+        if type is not None: description.append("type '%s'" % type)
+        if kind is not None: description.append("kind '%s'" % kind)
+        if id is not None: description.append("id '%s'" % id)
+        if label is not None: description.append("label '%s'" % label)
+        if predicate is not None:
+            description.append("predicate %s" % predicate)
+        # "is not None" so that an explicitly requested nr of 0 is reported
+        if orig_nr is not None: description.append("nr %d" % orig_nr)
+        description = ", ".join(description)
+
+        if ambiguous:
+            raise AmbiguityError("more than one control matching "+description)
+        # Not ambiguous and nothing returned above, so nothing was found.
+        raise ControlNotFoundError("no control matching "+description)
+
+    def _click(self, name, type, id, label, nr, coord, return_type,
+               request_class=urllib2.Request):
+        """Click the matching clickable control, or return the unclicked form.
+
+        The control is looked up with _find_control, restricted to kind
+        "clickable".  If none is found and the caller did not ask for a
+        particular control (name, type, id and label all None, nr 0), the
+        form's current state is returned via _switch_click without clicking
+        anything.  If a particular control was asked for,
+        ControlNotFoundError propagates.
+
+        return_type and request_class are passed on to the control's _click
+        method (or to _switch_click); see _switch_click for the meaning of
+        return_type.
+
+        """
+        try:
+            control = self._find_control(
+                name, type, "clickable", id, label, None, nr)
+        except ControlNotFoundError:
+            # A label is as explicit a request as a name, type or id:
+            # failing to find it must not silently submit the form unclicked.
+            if ((name is not None) or (type is not None) or (id is not None) or
+                (label is not None) or (nr != 0)):
+                raise
+            # no clickable controls, but no control was explicitly requested,
+            # so return state without clicking any control
+            return self._switch_click(return_type, request_class)
+        else:
+            return control._click(self, coord, return_type, request_class)
+
+    def _pairs(self):
+        """Return sequence of (key, value) pairs suitable for urlencoding.
+
+        Each control contributes (index, key, value) triples through
+        _totally_ordered_pairs.  The pairs are returned ordered by index;
+        triples with equal index keep the order in which they were collected.
+
+        """
+        decorated = []
+        for control in self.controls:
+            for index, key, val in control._totally_ordered_pairs():
+                # The running position is the tie-breaker that makes the
+                # sort stable on the index alone.
+                decorated.append((index, len(decorated), key, val))
+        decorated.sort()
+        return [(key, val) for (index, position, key, val) in decorated]
+
+ def _request_data(self):
+ """Return a tuple (url, data, headers).
+
+ url is self.action with its fragment removed. For GET the action's query
+ string is replaced by the urlencoded form pairs, data is None and headers
+ is an empty list. For POST the action's query string is kept and data is
+ the request body, encoded according to self.enctype.
+
+ ValueError is raised for a method other than GET or POST, and for an
+ enctype that is not supported for the method.
+
+ """
+ method = self.method.upper()
+ #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(self.action)
+ parts = urlparse.urlparse(self.action)
+ # split off the last two components (query and fragment) from the rest
+ rest, (query, frag) = parts[:-2], parts[-2:]
+
+ if method == "GET":
+ if self.enctype != "application/x-www-form-urlencoded":
+ raise ValueError(
+ "unknown GET form encoding type '%s'" % self.enctype)
+ # the form data becomes the query string; the fragment is dropped
+ parts = rest + (urlencode(self._pairs()), "")
+ uri = urlparse.urlunparse(parts)
+ return uri, None, []
+ elif method == "POST":
+ # keep the action's own query string; the fragment is dropped
+ parts = rest + (query, "")
+ uri = urlparse.urlunparse(parts)
+ if self.enctype == "application/x-www-form-urlencoded":
+ return (uri, urlencode(self._pairs()),
+ [("Content-type", self.enctype)])
+ elif self.enctype == "multipart/form-data":
+ # the body is accumulated in an in-memory buffer
+ data = StringIO()
+ # NOTE(review): presumably filled in by MimeWriter because of
+ # add_to_http_hdrs=True below -- confirm against MimeWriter
+ http_hdrs = []
+ mw = MimeWriter(data, http_hdrs)
+ # the return value f is not used in this method
+ f = mw.startmultipartbody("form-data", add_to_http_hdrs=True,
+ prefix=0)
+ # each control writes its own part(s), in form order
+ for control in self.controls:
+ control._write_mime_data(mw)
+ mw.lastpart()
+ return uri, data.getvalue(), http_hdrs
+ else:
+ raise ValueError(
+ "unknown POST form encoding type '%s'" % self.enctype)
+ else:
+ raise ValueError("Unknown method '%s'" % method)
+    def _switch_click(self, return_type, request_class=urllib2.Request):
+        """Return the form's current state in the shape named by return_type.
+
+        This is called by HTMLForm and clickable Controls to hide switching
+        on return_type:
+
+        "pairs": the result of self._pairs()
+        "request_data": the (url, data, headers) tuple from
+         self._request_data()
+        anything else: a request_class instance built from that url and
+         data, with each header added to it
+
+        """
+        if return_type == "pairs":
+            return self._pairs()
+        req_data = self._request_data()
+        if return_type == "request_data":
+            return req_data
+
+        req = request_class(req_data[0], req_data[1])
+        for key, val in req_data[2]:
+            add_hdr = req.add_header
+            if key.lower() == 'content-type':
+                # Use add_unredirected_header where the request object has
+                # it (it is missing pre-2.4 when not using ClientCookie).
+                add_hdr = getattr(req, "add_unredirected_header", add_hdr)
+            add_hdr(key, val)
+        return req
Added: Zope3/trunk/src/mechanize/__init__.py
===================================================================
--- Zope3/trunk/src/mechanize/__init__.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/mechanize/__init__.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,4 @@
+from _useragent import UserAgent#, http_get, http_put, http_head
+from _mechanize import Browser, Link, FormsFactory, \
+ BrowserStateError, LinkNotFoundError, FormNotFoundError, \
+ __version__
Added: Zope3/trunk/src/mechanize/_mechanize.py
===================================================================
--- Zope3/trunk/src/mechanize/_mechanize.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/mechanize/_mechanize.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,642 @@
+"""Stateful programmatic WWW navigation, after Perl's WWW::Mechanize.
+
+Copyright 2003-2005 John J. Lee <jjl at pobox.com>
+Copyright 2003 Andy Lester (original Perl code)
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD License (see the file COPYING included with the
+distribution).
+
+"""
+
+# XXXX
+# test referer bugs (frags and don't add in redirect unless orig req had Referer)
+
+# XXX
+# The stuff on web page's todo list.
+# Moof's emails about response object, .back(), etc.
+
+from __future__ import generators
+
+import urllib2, urlparse, re, sys
+
+import ClientCookie
+from ClientCookie._Util import response_seek_wrapper
+from ClientCookie._HeadersUtil import split_header_words, is_html
+# serves me right for not using a version tuple...
+VERSION_RE = re.compile(r"(?P<major>\d+)\.(?P<minor>\d+)\.(?P<bugfix>\d+)"
+ r"(?P<state>[ab])?(?:-pre)?(?P<pre>\d+)?$")
+def parse_version(text):
+ m = VERSION_RE.match(text)
+ if m is None:
+ raise ValueError
+ return tuple([m.groupdict()[part] for part in
+ ("major", "minor", "bugfix", "state", "pre")])
+assert map(int, parse_version(ClientCookie.VERSION)[:3]) >= [1, 0, 3], \
+ "ClientCookie 1.0.3 or newer is required"
+
+from _useragent import UserAgent
+
+__version__ = (0, 0, 10, "a", None) # 0.0.10a
+
+class BrowserStateError(Exception): pass
+class LinkNotFoundError(Exception): pass
+class FormNotFoundError(Exception): pass
+
+class Link:
+ def __init__(self, base_url, url, text, tag, attrs):
+ assert None not in [url, tag, attrs]
+ self.base_url = base_url
+ self.absolute_url = urlparse.urljoin(base_url, url)
+ self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
+ def __cmp__(self, other):
+ try:
+ for name in "url", "text", "tag", "attrs":
+ if getattr(self, name) != getattr(other, name):
+ return -1
+ except AttributeError:
+ return -1
+ return 0
+ def __repr__(self):
+ return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
+ self.base_url, self.url, self.text, self.tag, self.attrs)
+
+
+class LinksFactory:
+
+ def __init__(self,
+ link_parser_class=None,
+ link_class=Link,
+ urltags=None,
+ ):
+ import pullparser
+ assert pullparser.__version__[:3] >= (0, 0, 4), \
+ "pullparser 0.0.4b or newer is required"
+ if link_parser_class is None:
+ link_parser_class = pullparser.TolerantPullParser
+ self.link_parser_class = link_parser_class
+ self.link_class = link_class
+ if urltags is None:
+ urltags = {
+ "a": "href",
+ "area": "href",
+ "frame": "src",
+ "iframe": "src",
+ }
+ self.urltags = urltags
+
+ def links(self, fh, base_url, encoding=None):
+ """Return an iterator that provides links of the document."""
+ import pullparser
+ p = self.link_parser_class(fh, encoding=encoding)
+
+ for token in p.tags(*(self.urltags.keys()+["base"])):
+ if token.data == "base":
+ base_url = dict(token.attrs).get("href")
+ continue
+ if token.type == "endtag":
+ continue
+ attrs = dict(token.attrs)
+ tag = token.data
+ name = attrs.get("name")
+ text = None
+ # XXX need to sort out quoting
+ #url = urllib.quote_plus(attrs.get(self.urltags[tag]))
+ url = attrs.get(self.urltags[tag])
+ if tag == "a":
+ if token.type != "startendtag":
+ # XXX hmm, this'd break if end tag is missing
+ text = p.get_compressed_text(("endtag", tag))
+ # but this doesn't work for eg. <a href="blah"><b>Andy</b></a>
+ #text = p.get_compressed_text()
+ # This is a hack from WWW::Mechanize to get some really basic
+ # JavaScript working, which I'm not yet convinced is a good
+ # idea.
+## onClick = attrs["onclick"]
+## m = re.search(r"/^window\.open\(\s*'([^']+)'/", onClick)
+## if onClick and m:
+## url = m.group(1)
+ if not url:
+ # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
+ # For our purposes a link is something with a URL, so ignore
+ # this.
+ continue
+
+ yield Link(base_url, url, text, tag, token.attrs)
+
+class FormsFactory:
+
+ """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
+
+ For constructor argument docs, see ClientForm.ParseResponse
+ argument docs.
+
+ """
+
+ def __init__(self,
+ select_default=False,
+ form_parser_class=None,
+ request_class=None,
+ backwards_compat=False,
+ ):
+ import ClientForm
+ assert map(int, parse_version(ClientForm.VERSION)[:3]) >= [0, 2, 1], \
+ "ClientForm >= 0.2.1a is required"
+ self.select_default = select_default
+ if form_parser_class is None:
+ form_parser_class = ClientForm.FormParser
+ self.form_parser_class = form_parser_class
+ if request_class is None:
+ request_class = ClientCookie.Request
+ self.request_class = request_class
+ self.backwards_compat = backwards_compat
+
+ def parse_response(self, response):
+ import ClientForm
+ return ClientForm.ParseResponse(
+ response,
+ select_default=self.select_default,
+ form_parser_class=self.form_parser_class,
+ request_class=self.request_class,
+ backwards_compat=self.backwards_compat,
+ )
+
+ def parse_file(self, file_obj, base_url):
+ import ClientForm
+ return ClientForm.ParseFile(
+ file_obj,
+ base_url,
+ select_default=self.select_default,
+ form_parser_class=self.form_parser_class,
+ request_class=self.request_class,
+ backwards_compat=self.backwards_compat,
+ )
+
+if sys.version_info[:2] >= (2, 4):
+ from ClientCookie._Opener import OpenerMixin
+else:
+ class OpenerMixin: pass
+
+class Browser(UserAgent, OpenerMixin):
+ """Browser-like class with support for history, forms and links.
+
+ BrowserStateError is raised whenever the browser is in the wrong state to
+ complete the requested operation - eg., when .back() is called when the
+ browser history is empty, or when .follow_link() is called when the current
+ response does not contain HTML data.
+
+ Public attributes:
+
+ request: last request (ClientCookie.Request or urllib2.Request)
+ form: currently selected form (see .select_form())
+ default_encoding: character encoding used for encoding numeric character
+ references when matching link text, if no encoding is found in the response
+ (you should turn on HTTP-EQUIV handling if you want the best chance of
+ getting this right without resorting to this default)
+
+ """
+
+ def __init__(self, default_encoding="latin-1",
+ forms_factory=None,
+ links_factory=None,
+ request_class=None,
+ ):
+ """
+
+ Only named arguments should be passed to this constructor.
+
+ default_encoding: See class docs.
+ forms_factory: Object supporting the mechanize.FormsFactory interface.
+ links_factory: Object supporting the mechanize.LinksFactory interface.
+ request_class: Request class to use. Defaults to ClientCookie.Request
+ for Pythons older than 2.4, urllib2.Request otherwise.
+
+ Note that the supplied forms_factory's request_class attribute is
+ assigned to by this constructor, to ensure only one Request class is
+ used.
+
+ """
+ self.default_encoding = default_encoding
+ self._history = [] # LIFO
+ self.request = self._response = None
+ self.form = None
+ self._forms = None
+ self._title = None
+ self._links = None
+
+ if request_class is None:
+ if not hasattr(urllib2.Request, "add_unredirected_header"):
+ request_class = ClientCookie.Request
+ else:
+ request_class = urllib2.Request # Python 2.4
+ self.request_class = request_class
+ if forms_factory is None:
+ forms_factory = FormsFactory()
+ self._forms_factory = forms_factory
+ forms_factory.request_class = request_class
+ if links_factory is None:
+ links_factory = LinksFactory()
+ self._links_factory = links_factory
+
+ UserAgent.__init__(self) # do this last to avoid __getattr__ problems
+
+ def close(self):
+ if self._response is not None:
+ self._response.close()
+ UserAgent.close(self)
+ self._history = self._forms = self._title = self._links = None
+ self.request = self._response = None
+
+ def open(self, url, data=None):
+ if self._response is not None:
+ self._response.close()
+ return self._mech_open(url, data)
+
+ def _mech_open(self, url, data=None, update_history=True):
+ try:
+ url.get_full_url
+ except AttributeError:
+ # string URL -- convert to absolute URL if required
+ scheme, netloc = urlparse.urlparse(url)[:2]
+ if not scheme:
+ # relative URL
+ assert not netloc, "malformed URL"
+ if self._response is None:
+ raise BrowserStateError(
+ "can't fetch relative URL: not viewing any document")
+ url = urlparse.urljoin(self._response.geturl(), url)
+
+ if self.request is not None and update_history:
+ self._history.append((self.request, self._response))
+ self._response = None
+ # we want self.request to be assigned even if UserAgent.open fails
+ self.request = self._request(url, data)
+ self._previous_scheme = self.request.get_type()
+
+ self._response = UserAgent.open(self, self.request, data)
+ if not hasattr(self._response, "seek"):
+ self._response = response_seek_wrapper(self._response)
+ self._parse_html(self._response)
+
+ return self._response
+
+ def response(self):
+ """Return last response (as return value of urllib2.urlopen())."""
+ # XXX This is currently broken: responses returned by this method
+ # all share the same seek position.
+ return self._response
+
+ def geturl(self):
+ """Get URL of current document."""
+ if self._response is None:
+ raise BrowserStateError("not viewing any document")
+ return self._response.geturl()
+
+ def reload(self):
+ """Reload current document, and return response object."""
+ if self.request is None:
+ raise BrowserStateError("no URL has yet been .open()ed")
+ return self._mech_open(self.request, update_history=False)
+
+ def back(self, n=1):
+ """Go back n steps in history, and return response object.
+
+ n: go back this number of steps (default 1 step)
+
+ """
+ if self._response is not None:
+ self._response.close()
+ while n:
+ try:
+ self.request, self._response = self._history.pop()
+ except IndexError:
+ raise BrowserStateError("already at start of history")
+ n -= 1
+ self._parse_html(self._response)
+ return self._response
+
+ def links(self, **kwds):
+ """Return iterable over links (mechanize.Link objects)."""
+ if not self.viewing_html():
+ raise BrowserStateError("not viewing HTML")
+ if kwds:
+ return self._find_links(False, **kwds)
+ if self._links is None:
+ try:
+ self._links = list(self.get_links_iter())
+ finally:
+ self._response.seek(0)
+ return self._links
+
+ def get_links_iter(self):
+ """Return an iterator that provides links of the document.
+
+ This method is provided in addition to .links() to allow lazy iteration
+ over links, while still keeping .links() safe against somebody
+ .seek()ing on a response "behind your back". When response objects are
+ fixed to have independent seek positions, this method will be
+ deprecated in favour of .links().
+
+ """
+ if not self.viewing_html():
+ raise BrowserStateError("not viewing HTML")
+ base_url = self._response.geturl()
+ self._response.seek(0)
+ return self._links_factory.links(
+ self._response, base_url, self._encoding(self._response))
+
+ def forms(self):
+ """Return iterable over forms.
+
+ The returned form objects implement the ClientForm.HTMLForm interface.
+
+ """
+ if not self.viewing_html():
+ raise BrowserStateError("not viewing HTML")
+ if self._forms is None:
+ response = self._response
+ response.seek(0)
+ try:
+ self._forms = self._forms_factory.parse_response(response)
+ finally:
+ response.seek(0)
+ return self._forms
+
+ def viewing_html(self):
+ """Return whether the current response contains HTML data."""
+ if self._response is None:
+ raise BrowserStateError("not viewing any document")
+ ct_hdrs = self._response.info().getheaders("content-type")
+ url = self._response.geturl()
+ return is_html(ct_hdrs, url)
+
+ def title(self):
+ """Return title, or None if there is no title element in the document.
+
+ Tags are stripped or textified as described in docs for
+ PullParser.get_text() method of pullparser module.
+
+ """
+ import pullparser
+ if not self.viewing_html():
+ raise BrowserStateError("not viewing HTML")
+ if self._title is None:
+ p = pullparser.TolerantPullParser(
+ self._response, encoding=self._encoding(self._response))
+ try:
+ p.get_tag("title")
+ except pullparser.NoMoreTokensError:
+ pass
+ else:
+ self._title = p.get_text()
+ return self._title
+
+ def select_form(self, name=None, predicate=None, nr=None):
+ """Select an HTML form for input.
+
+ This is a bit like giving a form the "input focus" in a browser.
+
+ If a form is selected, the object supports the HTMLForm interface, so
+ you can call methods like .set_value(), .set(), and .click().
+
+ At least one of the name, predicate and nr arguments must be supplied.
+ If no matching form is found, mechanize.FormNotFoundError is raised.
+
+ If name is specified, then the form must have the indicated name.
+
+ If predicate is specified, then the form must match that function. The
+ predicate function is passed the HTMLForm as its single argument, and
+ should return a boolean value indicating whether the form matched.
+
+ nr, if supplied, is the sequence number of the form (where 0 is the
+ first). Note that form 0 is the first form matching all the other
+ arguments (if supplied); it is not necessarily the first form in the
+ document.
+
+ """
+ if not self.viewing_html():
+ raise BrowserStateError("not viewing HTML")
+ if (name is None) and (predicate is None) and (nr is None):
+ raise ValueError(
+ "at least one argument must be supplied to specify form")
+
+ orig_nr = nr
+ for form in self.forms():
+ if name is not None and name != form.name:
+ continue
+ if predicate is not None and not predicate(form):
+ continue
+ if nr:
+ nr -= 1
+ continue
+ self.form = form
+ break # success
+ else:
+ # failure
+ description = []
+ if name is not None: description.append("name '%s'" % name)
+ if predicate is not None:
+ description.append("predicate %s" % predicate)
+ if orig_nr is not None: description.append("nr %d" % orig_nr)
+ description = ", ".join(description)
+ raise FormNotFoundError("no form matching "+description)
+
+ def _add_referer_header(self, request, origin_request=True):
+ if self.request is None:
+ return request
+ scheme = request.get_type()
+ original_scheme = self.request.get_type()
+ if scheme not in ["http", "https"]:
+ return request
+ if not origin_request and not self.request.has_header('Referer'):
+ return request
+
+ if (self._handle_referer and
+ original_scheme in ["http", "https"] and
+ not (original_scheme == "https" and scheme != "https")):
+ # strip URL fragment (RFC 2616 14.36)
+ parts = urlparse.urlparse(self.request.get_full_url())
+ parts = parts[:-1]+("",)
+ referer = urlparse.urlunparse(parts)
+ request.add_unredirected_header("Referer", referer)
+ return request
+
+ def click(self, *args, **kwds):
+ """See ClientForm.HTMLForm.click for documentation."""
+ if not self.viewing_html():
+ raise BrowserStateError("not viewing HTML")
+ request = self.form.click(*args, **kwds)
+ return self._add_referer_header(request)
+
+ def submit(self, *args, **kwds):
+ """Submit current form.
+
+ Arguments are as for ClientForm.HTMLForm.click().
+
+ Return value is same as for Browser.open().
+
+ """
+ return self.open(self.click(*args, **kwds))
+
+ def click_link(self, link=None, **kwds):
+ """Find a link and return a Request object for it.
+
+ Arguments are as for .find_link(), except that a link may be supplied
+ as the first argument.
+
+ """
+ if not self.viewing_html():
+ raise BrowserStateError("not viewing HTML")
+ if not link:
+ link = self.find_link(**kwds)
+ else:
+ if kwds:
+ raise ValueError(
+ "either pass a Link, or keyword arguments, not both")
+ request = self.request_class(link.absolute_url)
+ return self._add_referer_header(request)
+
+ def follow_link(self, link=None, **kwds):
+ """Find a link and .open() it.
+
+ Arguments are as for .click_link().
+
+ Return value is same as for Browser.open().
+
+ """
+ return self.open(self.click_link(link, **kwds))
+
+ def find_link(self, **kwds):
+ """Find a link in current page.
+
+ Links are returned as mechanize.Link objects.
+
+ # Return third link that .search()-matches the regexp "python"
+ # (by ".search()-matches", I mean that the regular expression method
+ # .search() is used, rather than .match()).
+ find_link(text_regex=re.compile("python"), nr=2)
+
+ # Return first http link in the current page that points to somewhere
+ # on python.org whose link text (after tags have been removed) is
+ # exactly "monty python".
+ find_link(text="monty python",
+ url_regex=re.compile("http.*python.org"))
+
+ # Return first link with exactly three HTML attributes.
+ find_link(predicate=lambda link: len(link.attrs) == 3)
+
+ Links include anchors (<a>), image maps (<area>), and frames (<frame>,
+ <iframe>).
+
+ All arguments must be passed by keyword, not position. Zero or more
+ arguments may be supplied. In order to find a link, all arguments
+ supplied must match.
+
+ If a matching link is not found, mechanize.LinkNotFoundError is raised.
+
+ text: link text between link tags: eg. <a href="blah">this bit</a> (as
+ returned by pullparser.get_compressed_text(), ie. without tags but
+ with opening tags "textified" as per the pullparser docs) must compare
+ equal to this argument, if supplied
+ text_regex: link text between tag (as defined above) must match the
+ regular expression object passed as this argument, if supplied
+ name, name_regex: as for text and text_regex, but matched against the
+ name HTML attribute of the link tag
+ url, url_regex: as for text and text_regex, but matched against the
+ URL of the link tag (note this matches against Link.url, which is a
+ relative or absolute URL according to how it was written in the HTML)
+ tag: element name of opening tag, eg. "a"
+ predicate: a function taking a Link object as its single argument,
+ returning a boolean result, indicating whether the link matches
+ nr: matches the nth link that matches all other criteria (default 0)
+
+ """
+ return self._find_links(True, **kwds)
+
+ def __getattr__(self, name):
+ # pass through ClientForm / DOMForm methods and attributes
+ form = self.__dict__.get("form")
+ if form is None:
+ raise AttributeError(
+ "%s instance has no attribute %s (perhaps you forgot to "
+ ".select_form()?)" % (self.__class__, name))
+ return getattr(form, name)
+
+#---------------------------------------------------
+# Private methods.
+
+ def _find_links(self, single,
+ text=None, text_regex=None,
+ name=None, name_regex=None,
+ url=None, url_regex=None,
+ tag=None,
+ predicate=None,
+ nr=0
+ ):
+ if not self.viewing_html():
+ raise BrowserStateError("not viewing HTML")
+
+ found_links = []
+ orig_nr = nr
+
+ # An optimization, so that if we look for a single link we do not have
+ # to necessarily parse the entire file.
+ if self._links is None and single:
+ all_links = self.get_links_iter()
+ else:
+ if self._links is None:
+ try:
+ self._links = list(self.get_links_iter())
+ finally:
+ self._response.seek(0)
+ all_links = self._links
+
+ for link in all_links:
+ if url is not None and url != link.url:
+ continue
+ if url_regex is not None and not url_regex.search(link.url):
+ continue
+ if (text is not None and
+ (link.text is None or text != link.text)):
+ continue
+ if (text_regex is not None and
+ (link.text is None or not text_regex.search(link.text))):
+ continue
+ if name is not None and name != dict(link.attrs).get("name"):
+ continue
+ if name_regex is not None:
+ link_name = dict(link.attrs).get("name")
+ if link_name is None or not name_regex.search(link_name):
+ continue
+ if tag is not None and tag != link.tag:
+ continue
+ if predicate is not None and not predicate(link):
+ continue
+ if nr:
+ nr -= 1
+ continue
+ if single:
+ return link
+ else:
+ found_links.append(link)
+ nr = orig_nr
+ if not found_links:
+ raise LinkNotFoundError()
+ return found_links
+
+ def _encoding(self, response):
+ # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
+ # headers may be in the response. HTTP-EQUIV headers come last,
+ # so try in order from first to last.
+ for ct in response.info().getheaders("content-type"):
+ for k, v in split_header_words([ct])[0]:
+ if k == "charset":
+ return v
+ return self.default_encoding
+
+ def _parse_html(self, response):
+ # this is now lazy, so we just reset the various attributes that
+ # result from parsing
+ self.form = None
+ self._title = None
+ self._forms = self._links = None
Added: Zope3/trunk/src/mechanize/_useragent.py
===================================================================
--- Zope3/trunk/src/mechanize/_useragent.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/mechanize/_useragent.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,309 @@
+"""Convenient HTTP UserAgent class.
+
+This is a subclass of urllib2.OpenerDirector.
+
+
+Copyright 2003 John J. Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD License (see the file COPYING included with the
+distribution).
+
+"""
+
+import sys
+import urllib2, httplib
+import ClientCookie
+if sys.version_info[:2] >= (2, 4):
+ import cookielib
+ from urllib2 import OpenerDirector, BaseHandler, \
+ HTTPHandler, HTTPSHandler, HTTPErrorProcessor
+ class SaneHTTPCookieProcessor(ClientCookie.HTTPCookieProcessor):
+ # Workaround for RFC 2109 bug http://python.org/sf/1157027 (at least if
+ # you don't pass your own CookieJar in: if that's the case, you should
+ # pass rfc2965=True to the DefaultCookiePolicy constructor yourself, or
+ # set the corresponding attribute).
+ def __init__(self, cookiejar=None):
+ if cookiejar is None:
+ cookiejar = cookielib.CookieJar(
+ cookielib.DefaultCookiePolicy(rfc2965=True))
+ self.cookiejar = cookiejar
+ HTTPCookieProcessor = SaneHTTPCookieProcessor
+else:
+ from ClientCookie import OpenerDirector, BaseHandler, \
+ HTTPHandler, HTTPSHandler, HTTPErrorProcessor, HTTPCookieProcessor
+
+class HTTPRefererProcessor(BaseHandler):
+ def http_request(self, request):
+ # See RFC 2616 14.36. The only times we know the source of the
+ # request URI has a URI associated with it are redirect, and
+ # Browser.click() / Browser.submit() / Browser.follow_link().
+ # Otherwise, it's the user's job to add any Referer header before
+ # .open()ing.
+ if hasattr(request, "redirect_dict"):
+ request = self.parent._add_referer_header(
+ request, origin_request=False)
+ return request
+
+ https_request = http_request
+
+
+class UserAgent(OpenerDirector):
+ """Convenient user-agent class.
+
+ Do not use .add_handler() to add a handler for something already dealt with
+ by this code.
+
+ Public attributes:
+
+ addheaders: list of (name, value) pairs specifying headers to send with
+ every request, unless they are overridden in the Request instance.
+
+ >>> ua = UserAgent()
+ >>> ua.addheaders = [
+ ... ("User-agent", "Mozilla/5.0 (compatible)"),
+ ... ("From", "responsible.person at example.com")]
+
+ """
+
+ handler_classes = {
+ # scheme handlers
+ "http": HTTPHandler,
+ "ftp": urllib2.FTPHandler, # CacheFTPHandler is buggy in 2.3
+ "file": urllib2.FileHandler,
+ "gopher": urllib2.GopherHandler,
+ # XXX etc.
+
+ # other handlers
+ "_unknown": urllib2.UnknownHandler,
+ # HTTP{S,}Handler depend on HTTPErrorProcessor too
+ "_http_error": HTTPErrorProcessor,
+ "_http_request_upgrade": ClientCookie.HTTPRequestUpgradeProcessor,
+ "_http_default_error": urllib2.HTTPDefaultErrorHandler,
+
+ # feature handlers
+ "_authen": urllib2.HTTPBasicAuthHandler,
+ # XXX rest of authentication stuff
+ "_redirect": ClientCookie.HTTPRedirectHandler,
+ "_cookies": HTTPCookieProcessor,
+ "_refresh": ClientCookie.HTTPRefreshProcessor,
+ "_referer": HTTPRefererProcessor, # from this module, note
+ "_equiv": ClientCookie.HTTPEquivProcessor,
+ "_seek": ClientCookie.SeekableProcessor,
+ "_proxy": urllib2.ProxyHandler,
+ # XXX there's more to proxies, too
+
+ # debug handlers
+ "_debug_redirect": ClientCookie.HTTPRedirectDebugProcessor,
+ "_debug_response_body": ClientCookie.HTTPResponseDebugProcessor,
+ }
+
+ default_schemes = ["http", "ftp", "file", "gopher"]
+ default_others = ["_unknown", "_http_error", "_http_request_upgrade",
+ "_http_default_error"]
+ default_features = ["_authen", "_redirect", "_cookies", "_refresh",
+ "_referer", "_equiv", "_seek", "_proxy"]
+ if hasattr(httplib, 'HTTPS'):
+ handler_classes["https"] = HTTPSHandler
+ default_schemes.append("https")
+ if hasattr(ClientCookie, "HTTPRobotRulesProcessor"):
+ handler_classes["_robots"] = ClientCookie.HTTPRobotRulesProcessor
+ default_features.append("_robots")
+
+ def __init__(self):
+ OpenerDirector.__init__(self)
+
+ self._ua_handlers = {}
+ for scheme in (self.default_schemes+
+ self.default_others+
+ self.default_features):
+ klass = self.handler_classes[scheme]
+ self._ua_handlers[scheme] = klass()
+ for handler in self._ua_handlers.itervalues():
+ self.add_handler(handler)
+
+ # Ensure correct default constructor args were passed to
+ # HTTPRefreshProcessor and HTTPEquivProcessor. Yuck.
+ if '_refresh' in self._ua_handlers:
+ self.set_handle_refresh(True)
+ if '_equiv' in self._ua_handlers:
+ self.set_handle_equiv(True)
+
+ # special case, requires extra support from mechanize.Browser
+ self._handle_referer = True
+
+ def close(self):
+ OpenerDirector.close(self)
+ self._ua_handlers = None
+
+ # XXX
+## def set_timeout(self, timeout):
+## self._timeout = timeout
+## def set_http_connection_cache(self, conn_cache):
+## self._http_conn_cache = conn_cache
+## def set_ftp_connection_cache(self, conn_cache):
+## # XXX ATM, FTP has cache as part of handler; should it be separate?
+## self._ftp_conn_cache = conn_cache
+
+ def set_handled_schemes(self, schemes):
+ """Set sequence of protocol scheme strings.
+
+ If this fails (with ValueError) because you've passed an unknown
+ scheme, the set of handled schemes WILL be updated, but schemes in the
+ list that come after the unknown scheme won't be handled.
+
+ """
+ want = {}
+ for scheme in schemes:
+ if scheme.startswith("_"):
+ raise ValueError("invalid scheme '%s'" % scheme)
+ want[scheme] = None
+
+ # get rid of scheme handlers we don't want
+ for scheme, oldhandler in self._ua_handlers.items():
+ if scheme.startswith("_"): continue # not a scheme handler
+ if scheme not in want:
+ self._replace_handler(scheme, None)
+ else:
+ del want[scheme] # already got it
+ # add the scheme handlers that are missing
+ for scheme in want.keys():
+ if scheme not in self.handler_classes:
+ raise ValueError("unknown scheme '%s'")
+ self._set_handler(scheme, True)
+
+ def _add_referer_header(self, request, origin_request=True):
+ raise NotImplementedError(
+ "this class can't do HTTP Referer: use mechanize.Browser instead")
+
+ def set_cookiejar(self, cookiejar):
+ """Set a ClientCookie.CookieJar, or None."""
+ self._set_handler("_cookies", obj=cookiejar)
+ def set_credentials(self, credentials):
+ """Set a urllib2.HTTPPasswordMgr, or None."""
+ # XXX use Greg Stein's httpx instead?
+ self._set_handler("_authen", obj=credentials)
+
+ # these methods all take a boolean parameter
+ def set_handle_robots(self, handle):
+ """Set whether to observe rules from robots.txt."""
+ self._set_handler("_robots", handle)
+ def set_handle_redirect(self, handle):
+ """Set whether to handle HTTP 30x redirections."""
+ self._set_handler("_redirect", handle)
+ def set_handle_refresh(self, handle, max_time=None, honor_time=True):
+ """Set whether to handle HTTP Refresh headers."""
+ self._set_handler("_refresh", handle, constructor_kwds=
+ {"max_time": max_time, "honor_time": honor_time})
+ def set_handle_equiv(self, handle, head_parser_class=None):
+ """Set whether to treat HTML http-equiv headers like HTTP headers.
+
+ Response objects will be .seek()able if this is set.
+
+ """
+ if head_parser_class is not None:
+ constructor_kwds = {"head_parser_class": head_parser_class}
+ else:
+ constructor_kwds={}
+ self._set_handler("_equiv", handle, constructor_kwds=constructor_kwds)
+ def set_handle_referer(self, handle):
+ """Set whether to add Referer header to each request.
+
+ This base class does not implement this feature (so don't turn this on
+ if you're using this base class directly), but the subclass
+ mechanize.Browser does.
+
+ """
+ self._set_handler("_referer", handle)
+ self._handle_referer = True
+ def set_seekable_responses(self, handle):
+ """Make response objects .seek()able."""
+ self._set_handler("_seek", handle)
+ def set_debug_redirects(self, handle):
+ """Log information about HTTP redirects.
+
+ This includes refreshes, which show up as faked 302 redirections at the
+ moment.
+
+ Logging is performed using module logging. The logger name is
+ "ClientCookie.http_redirects". To actually print some debug output,
+ eg:
+
+ logger = logging.getLogger("ClientCookie.http_redirects")
+ logger.addHandler(logging.StreamHandler())
+ logger.setLevel(logging.INFO)
+
+ Other logger names relevant to this module:
+
+ "ClientCookie.http_responses"
+ "ClientCookie.cookies" (or "cookielib" if running Python 2.4)
+
+ To turn on everything:
+
+ for logger in [
+ logging.getLogger("ClientCookie"),
+ logging.getLogger("cookielib"),
+ ]:
+ logger.addHandler(logging.StreamHandler())
+ logger.setLevel(logging.INFO)
+
+ """
+ self._set_handler("_debug_redirect", handle)
+ def set_debug_responses(self, handle):
+ """Log HTTP response bodies.
+
+ See docstring for .set_debug_redirects() for details of logging.
+
+ """
+ self._set_handler("_debug_response_body", handle)
+ def set_debug_http(self, handle):
+ """Print HTTP headers to sys.stdout."""
+ level = int(bool(handle))
+ for scheme in "http", "https":
+ h = self._ua_handlers.get(scheme)
+ if h is not None:
+ h.set_http_debuglevel(level)
+
+ def _set_handler(self, name, handle=None, obj=None,
+ constructor_args=(), constructor_kwds={}):
+ if handle is None:
+ handle = obj is not None
+ if handle:
+ handler_class = self.handler_classes[name]
+ if obj is not None:
+ newhandler = handler_class(obj)
+ else:
+ newhandler = handler_class(*constructor_args, **constructor_kwds)
+ else:
+ newhandler = None
+ self._replace_handler(name, newhandler)
+
+ def _replace_handler(self, name, newhandler=None):
+ # first, if handler was previously added, remove it
+ if name is not None:
+ try:
+ handler = self._ua_handlers[name]
+ except:
+ pass
+ else:
+ for table in (
+ [self.handle_open,
+ self.process_request, self.process_response]+
+ self.handle_error.values()):
+ for handlers in table.values():
+ remove(handlers, handler)
+ remove(self.handlers, handler)
+ # then add the replacement, if any
+ if newhandler is not None:
+ self.add_handler(newhandler)
+ self._ua_handlers[name] = newhandler
+
+def remove(sequence, obj):
+ # for use when can't use .remove() because of obj.__cmp__ :-(
+ # (ClientCookie only requires Python 2.0, which doesn't have __lt__)
+ i = 0
+ while i < len(sequence):
+ if sequence[i] is obj:
+ del sequence[i]
+ else:
+ i += 1
Added: Zope3/trunk/src/pullparser.py
===================================================================
--- Zope3/trunk/src/pullparser.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/pullparser.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -0,0 +1,350 @@
+"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
+
+Examples
+
+This program extracts all links from a document. It will print one
+line for each link, containing the URL and the textual description
+between the <A>...</A> tags:
+
+import pullparser, sys
+f = file(sys.argv[1])
+p = pullparser.PullParser(f)
+for token in p.tags("a"):
+ if token.type == "endtag": continue
+ url = dict(token.attrs).get("href", "-")
+ text = p.get_compressed_text(endat=("endtag", "a"))
+ print "%s\t%s" % (url, text)
+
+This program extracts the <TITLE> from the document:
+
+import pullparser, sys
+f = file(sys.argv[1])
+p = pullparser.PullParser(f)
+if p.get_tag("title"):
+ title = p.get_compressed_text()
+ print "Title: %s" % title
+
+
+Copyright 2003-2004 John J. Lee <jjl at pobox.com>
+Copyright 1998-2001 Gisle Aas (original libwww-perl code)
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD License.
+
+"""
+
+from __future__ import generators
+
+import re, htmlentitydefs
+import HTMLParser
+
+__version__ = (0, 0, 6, None, None) # 0.0.6b
+
# Raised by the parser's get_token() when the underlying file object has no
# more data to read.
class NoMoreTokensError(Exception): pass
+
class Token:
    """Represents an HTML tag, declaration, processing instruction etc.

    Behaves as both a tuple-like object (ie. iterable) and has attributes
    .type, .data and .attrs.

    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
    True
    >>> (t.type, t.data) == ("starttag", "a")
    True
    >>> t.attrs == [("href", "http://www.python.org/")]
    True
    >>> t == None
    False

    Public attributes

    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
     "data", "comment", "decl", "pi", after the corresponding methods of
     HTMLParser.HTMLParser
    data: For a tag, the tag name; otherwise, the relevant data carried by the
     tag, as a string
    attrs: list of (name, value) pairs representing HTML attributes
     (or None if token does not represent an opening tag)

    """
    def __init__(self, type, data, attrs=None):
        self.type = type
        self.data = data
        self.attrs = attrs
    def __iter__(self):
        return iter((self.type, self.data, self.attrs))
    def __eq__(self, other):
        # Anything that does not unpack into exactly three items (None, a
        # number, a shorter tuple, ...) simply compares unequal instead of
        # raising from inside ==.
        try:
            type, data, attrs = other
        except (TypeError, ValueError):
            return False
        if (self.type == type and
            self.data == data and
            self.attrs == attrs):
            return True
        else:
            return False
    def __ne__(self, other): return not self.__eq__(other)
    def __repr__(self):
        args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
        return self.__class__.__name__+"(%s)" % args
+
def iter_until_exception(fn, exception, *args, **kwds):
    """Yield fn(*args, **kwds) over and over until it raises exception.

    fn: callable producing the next value
    exception: exception class (or tuple of classes) that signals the end
    args, kwds: passed to fn on every call

    The terminating exception is swallowed; any other exception propagates.

    """
    while 1:
        try:
            value = fn(*args, **kwds)
        except exception:
            # A plain return ends the generator.  Raising StopIteration here
            # is turned into RuntimeError on interpreters implementing
            # PEP 479.
            return
        yield value
+
def caller():
    """Return the name of the function that called our caller.

    That is, if f() calls g() and g() calls caller(), the result is "f".

    Raises AttributeError if the code calling caller() has no calling frame
    of its own.

    """
    import sys
    # sys._getframe(1) is the frame of the code that called caller(); its
    # f_back is the frame we want.  This replaces raising a dummy exception
    # and reading the deprecated, per-process sys.exc_traceback.
    return sys._getframe(1).f_back.f_code.co_name
+
def unescape(data, entities):
    """Replace the entity references in data that entities knows about.

    data: string to process, or None (returned unchanged)
    entities: mapping from complete references such as "&amp;" to their
     replacement text

    References missing from entities are left exactly as they appear.

    """
    if data is None or '&' not in data:
        return data
    entity_re = re.compile(r'&\S+?;')
    return entity_re.sub(
        lambda match: entities.get(match.group(), match.group()), data)
+
def get_entitydefs():
    """Return a fresh mapping from "&name;" to the entity's replacement text.

    Built from htmlentitydefs.entitydefs, whose keys are bare entity names;
    here each key is wrapped in "&" and ";".

    """
    return dict([("&%s;" % name, char)
                 for name, char in htmlentitydefs.entitydefs.items()])
+
+
+class _AbstractParser:
+ chunk = 1024
+ compress_re = re.compile(r"\s+")
+ entitydefs = htmlentitydefs.entitydefs
+ def __init__(self, fh, textify={"img": "alt", "applet": "alt"},
+ encoding="ascii", entitydefs=None):
+ """
+ fh: file-like object (only a .read() method is required) from which to
+ read HTML to be parsed
+ textify: mapping used by .get_text() and .get_compressed_text() methods
+ to represent opening tags as text
+ encoding: encoding used to encode numeric character references by
+ .get_text() and .get_compressed_text() ("ascii" by default)
+ entitydefs: mapping like {'&': '&', ...} containing HTML entity
+ definitions (a sensible default is used)
+
+ If the element name of an opening tag matches a key in the textify
+ mapping then that tag is converted to text. The corresponding value is
+ used to specify which tag attribute to obtain the text from. textify
+ maps from element names to either:
+
+ - an HTML attribute name, in which case the HTML attribute value is
+ used as its text value along with the element name in square
+ brackets (eg."alt text goes here[IMG]", or, if the alt attribute
+ were missing, just "[IMG]")
+ - a callable object (eg. a function) which takes a Token and returns
+ the string to be used as its text value
+
+ If textify has no key for an element name, nothing is substituted for
+ the opening tag.
+
+ Public attributes:
+
+ encoding and textify: see above
+
+ """
+ self._fh = fh
+ self._tokenstack = [] # FIFO
+ self.textify = textify
+ self.encoding = encoding
+ if entitydefs is None:
+ entitydefs = get_entitydefs()
+ self._entitydefs = entitydefs
+
+ def __iter__(self): return self
+
+ def tags(self, *names):
+ return iter_until_exception(self.get_tag, NoMoreTokensError, *names)
+
+ def tokens(self, *tokentypes):
+ return iter_until_exception(self.get_token, NoMoreTokensError, *tokentypes)
+
+ def next(self):
+ try:
+ return self.get_token()
+ except NoMoreTokensError:
+ raise StopIteration()
+
+ def get_token(self, *tokentypes):
+ """Pop the next Token object from the stack of parsed tokens.
+
+ If arguments are given, they are taken to be token types in which the
+ caller is interested: tokens representing other elements will be
+ skipped. Element names must be given in lower case.
+
+ Raises NoMoreTokensError.
+
+ """
+ while 1:
+ while self._tokenstack:
+ token = self._tokenstack.pop(0)
+ if tokentypes:
+ if token.type in tokentypes:
+ return token
+ else:
+ return token
+ data = self._fh.read(self.chunk)
+ if not data:
+ raise NoMoreTokensError()
+ self.feed(data)
+
+ def unget_token(self, token):
+ """Push a Token back onto the stack."""
+ self._tokenstack.insert(0, token)
+
+ def get_tag(self, *names):
+ """Return the next Token that represents an opening or closing tag.
+
+ If arguments are given, they are taken to be element names in which the
+ caller is interested: tags representing other elements will be skipped.
+ Element names must be given in lower case.
+
+ Raises NoMoreTokensError.
+
+ """
+ while 1:
+ tok = self.get_token()
+ if tok.type not in ["starttag", "endtag", "startendtag"]:
+ continue
+ if names:
+ if tok.data in names:
+ return tok
+ else:
+ return tok
+
+ def get_text(self, endat=None):
+ """Get some text.
+
+ endat: stop reading text at this tag (the tag is included in the
+ returned text); endtag is a tuple (type, name) where type is
+ "starttag", "endtag" or "startendtag", and name is the element name of
+ the tag (element names must be given in lower case)
+
+ If endat is not given, .get_text() will stop at the next opening or
+ closing tag, or when there are no more tokens (no exception is raised).
+ Note that .get_text() includes the text representation (if any) of the
+ opening tag, but pushes the opening tag back onto the stack. As a
+ result, if you want to call .get_text() again, you need to call
+ .get_tag() first (unless you want an empty string returned when you
+ next call .get_text()).
+
+ Entity references are translated using the entitydefs attribute (a
+ mapping from names to characters like that provided by the standard
+ module htmlentitydefs). Named entity references that are not in this
+ mapping are left unchanged.
+
+ The textify attribute is used to translate opening tags into text: see
+ the class docstring.
+
+ """
+ text = []
+ tok = None
+ while 1:
+ try:
+ tok = self.get_token()
+ except NoMoreTokensError:
+ # unget last token (not the one we just failed to get)
+ if tok: self.unget_token(tok)
+ break
+ if tok.type == "data":
+ text.append(tok.data)
+ elif tok.type == "entityref":
+ name = tok.data
+ if name in self.entitydefs:
+ t = self.entitydefs[name]
+ else:
+ t = "&%s;" % name
+ text.append(t)
+ elif tok.type == "charref":
+ name, base = tok.data, 10
+ if name.startswith('x'):
+ name, base= name[1:], 16
+ t = unichr(int(name, base)).encode(self.encoding)
+ text.append(t)
+ elif tok.type in ["starttag", "endtag", "startendtag"]:
+ tag_name = tok.data
+ if tok.type in ["starttag", "startendtag"]:
+ alt = self.textify.get(tag_name)
+ if alt is not None:
+ if callable(alt):
+ text.append(alt(tok))
+ elif tok.attrs is not None:
+ for k, v in tok.attrs:
+ if k == alt:
+ text.append(v)
+ text.append("[%s]" % tag_name.upper())
+ if endat is None or endat == (tok.type, tag_name):
+ self.unget_token(tok)
+ break
+ return "".join(text)
+
+ def get_compressed_text(self, *args, **kwds):
+ """
+ As .get_text(), but collapses each group of contiguous whitespace to a
+ single space character, and removes all initial and trailing
+ whitespace.
+
+ """
+ text = self.get_text(*args, **kwds)
+ text = text.strip()
+ return self.compress_re.sub(" ", text)
+
+ def handle_startendtag(self, tag, attrs):
+ self._tokenstack.append(Token("startendtag", tag, attrs))
+ def handle_starttag(self, tag, attrs):
+ self._tokenstack.append(Token("starttag", tag, attrs))
+ def handle_endtag(self, tag):
+ self._tokenstack.append(Token("endtag", tag))
+ def handle_charref(self, name):
+ self._tokenstack.append(Token("charref", name))
+ def handle_entityref(self, name):
+ self._tokenstack.append(Token("entityref", name))
+ def handle_data(self, data):
+ self._tokenstack.append(Token("data", data))
+ def handle_comment(self, data):
+ self._tokenstack.append(Token("comment", data))
+ def handle_decl(self, decl):
+ self._tokenstack.append(Token("decl", decl))
+ def unknown_decl(self, data):
+ # XXX should this call self.error instead?
+ #self.error("unknown declaration: " + `data`)
+ self._tokenstack.append(Token("decl", data))
+ def handle_pi(self, data):
+ self._tokenstack.append(Token("pi", data))
+
+ def unescape_attr(self, name):
+ return unescape(name, self._entitydefs)
+ def unescape_attrs(self, attrs):
+ escaped_attrs = []
+ for key, val in attrs:
+ escaped_attrs.append((key, self.unescape_attr(val)))
+ return escaped_attrs
+
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    """Pull parser whose tokenising is done by HTMLParser.HTMLParser."""
    def __init__(self, *args, **kwds):
        # All arguments go to _AbstractParser.__init__; the HTMLParser base
        # is initialised without any.
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)
    def unescape(self, name):
        # Use the entitydefs passed into constructor, not
        # HTMLParser.HTMLParser's entitydefs.
        return self.unescape_attr(name)
+
+import sgmllib
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    """Pull parser whose tokenising is done by sgmllib.SGMLParser."""
    def __init__(self, *args, **kwds):
        # All arguments go to _AbstractParser.__init__; the SGMLParser base
        # is initialised without any.
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)
    def unknown_starttag(self, tag, attrs):
        # Attribute values are entity-unescaped here, before the token is
        # queued.
        attrs = self.unescape_attrs(attrs)
        self._tokenstack.append(Token("starttag", tag, attrs))
    def unknown_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
Deleted: Zope3/trunk/src/zope/testbrowser/BUGFIXES.txt
===================================================================
--- Zope3/trunk/src/zope/testbrowser/BUGFIXES.txt 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/zope/testbrowser/BUGFIXES.txt 2005-11-01 17:34:14 UTC (rev 39818)
@@ -1,31 +0,0 @@
-===================
-Mechanize Bug Fixes
-===================
-
-This file contains bug fixes to the mechanize framework that have not yet been
-reported to mechanize authors.
-
-``mechanize``
--------------
-
-- The ``Browser.close()`` method fails to close the current response using
- ``Browser._response.close()``. If you are opening a lot of pages, then you
- will eventually end up with a "too many files opened" error.
-
-- The ``update_history`` argument of ``Browser._mech_open()`` was unused.
-
-- In the ``Browser.close()`` method, the ``Browser._history`` attribute is set
- to ``None``, but it should be ``[]`` (an empty list).
-
-- Related to the entry above, when the history is reset, none of the stored
- responses is closed.
-
-- Due to the poor design of the history implementation, you can still get "too
- many files opened" if you use the rest of the ``Browser`` API. The ideal
- solution would be to close the previous response just before you use
- ``Browser._mech_open()``. In fact this method should be responsible for the
- cleanup.
-
-- URLs that are retrieved using pullparser might be unclean, meaning they
- might contain leading and trailing whitespace. The ``Link`` class should
- thus strip the base and link url.
Deleted: Zope3/trunk/src/zope/testbrowser/ClientForm.py
===================================================================
--- Zope3/trunk/src/zope/testbrowser/ClientForm.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/zope/testbrowser/ClientForm.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -1,3036 +0,0 @@
-"""HTML form handling for web clients.
-
-ClientForm is a Python module for handling HTML forms on the client
-side, useful for parsing HTML forms, filling them in and returning the
-completed forms to the server. It has developed from a port of Gisle
-Aas' Perl module HTML::Form, from the libwww-perl library, but the
-interface is not the same.
-
-The most useful docstring is the one for HTMLForm.
-
-RFC 1866: HTML 2.0
-RFC 1867: Form-based File Upload in HTML
-RFC 2388: Returning Values from Forms: multipart/form-data
-HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX)
-HTML 4.01 Specification, W3C Recommendation 24 December 1999
-
-
-Copyright 2002-2005 John J. Lee <jjl at pobox.com>
-Copyright 1998-2000 Gisle Aas.
-
-This code is free software; you can redistribute it and/or modify it
-under the terms of the BSD License (see the file COPYING included with
-the distribution).
-
-"""
-
-# XXX
-# Add some functional tests
-# Especially single and multiple file upload on the internet.
-# Does file upload work when name is missing? Sourceforge tracker form
-# doesn't like it. Check standards, and test with Apache. Test
-# binary upload with Apache.
-# Unicode: see Wichert Akkerman's 2004-01-22 message to c.l.py.
-# Controls can have name=None (eg. forms constructed partly with
-# JavaScript), but find_control can't be told to find a control
-# with that name, because None there means 'unspecified'. Can still
-# get at by nr, but would be nice to be able to specify something
-# equivalent to name=None, too.
-# Support for list item ids. How to handle missing ids? (How do I deal
-# with duplicate OPTION labels ATM? Can't remember...)
-# Deal with character sets properly. Not sure what the issues are here.
-# Do URL encodings need any attention?
-# I don't *think* any encoding of control names, filenames or data is
-# necessary -- HTML spec. doesn't require it, and Mozilla Firebird 0.6
-# doesn't seem to do it.
-# Add charset parameter to Content-type headers? How to find value??
-# I'm not going to fix this unless somebody tells me what real servers
-# that want this encoding actually expect: If enctype is
-# application/x-www-form-urlencoded and there's a FILE control present.
-# Strictly, it should be 'name=data' (see HTML 4.01 spec., section
-# 17.13.2), but I send "name=" ATM. What about multiple file upload??
-# Get rid of MapBase, AList and MimeWriter.
-# Should really use sgmllib, not htmllib.
-# Factor out multiple-selection list code? May not be easy. Maybe like
-# this:
-
-# ListControl
-# ^
-# | MultipleListControlMixin
-# | ^
-# SelectControl /
-# ^ /
-# \ /
-# MultiSelectControl
-
-
-# Plan
-# ----
-# Maybe a 0.2.x, cleaned up a bit and with id support for list items?
-# Not sure it's worth it...
-# action should probably be an absolute URI, like DOMForm.
-# Replace by_label with choice between value / id / label /
-# element contents (see discussion with Gisle about labels on
-# libwww-perl list).
-# ...what else?
-# Work on DOMForm.
-# XForms? Don't know if there's a need here.
-
-
# Compatibility shims: define True/False and bool() on interpreters where
# looking the names up raises NameError.
try: True
except NameError:
    True = 1
    False = 0

try: bool
except NameError:
    def bool(expr):
        # Stand-in that maps any object to True or False by its truth value.
        if expr: return True
        else: return False
-
-import sys, urllib, urllib2, types, string, mimetools, copy, urlparse, \
- htmlentitydefs, re
-from urlparse import urljoin
-from cStringIO import StringIO
# UNICODE records whether types.UnicodeType can be imported on this
# interpreter.
try:
    from types import UnicodeType
except ImportError:
    UNICODE = False
else:
    UNICODE = True

# deprecation(message) issues a DeprecationWarning via the warnings module;
# when that module cannot be imported it silently does nothing.
try:
    import warnings
except ImportError:
    def deprecation(message):
        pass
else:
    def deprecation(message):
        # stacklevel=2 attributes the warning to the code that called
        # deprecation(), not to this helper
        warnings.warn(message, DeprecationWarning, stacklevel=2)

VERSION = "0.1.18"

CHUNK = 1024  # size of chunks fed to parser, in bytes

# compressText(text): strip text, then replace each run of whitespace with a
# single space.
_compress_re = re.compile(r"\s+")
compressText = lambda text: _compress_re.sub(' ', text.strip())
-
-# This version of urlencode is from my Python 1.5.2 back-port of the
-# Python 2.1 CVS maintenance branch of urllib. It will accept a sequence
-# of pairs instead of a mapping -- the 2.0 version only accepts a mapping.
def urlencode(query,doseq=False,):
    """Encode a sequence of two-element tuples or dictionary into a URL query \
string.

    query: mapping (anything with an .items() method) or sequence of
     (key, value) tuples
    doseq: if true, a value that is itself a sequence (other than a string)
     is expanded into one key=value parameter per element

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Returns the key=value pairs, quoted with urllib.quote_plus, joined by "&".

    Raises TypeError if query is neither a mapping nor a non-string sequence.
    """

    if hasattr(query,"items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and type(query[0]) != types.TupleType:
                raise TypeError()
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            # Only the message goes into the exception: passing the traceback
            # as a second argument would store it in .args and garble str().
            raise TypeError("not a valid non-string sequence or mapping "
                            "object")

    l = []
    if not doseq:
        # preserve old behavior
        for k, v in query:
            k = urllib.quote_plus(str(k))
            v = urllib.quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = urllib.quote_plus(str(k))
            if type(v) == types.StringType:
                v = urllib.quote_plus(v)
                l.append(k + '=' + v)
            elif UNICODE and type(v) == types.UnicodeType:
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = urllib.quote_plus(v.encode("ASCII","replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    len(v)
                except TypeError:
                    # not a sequence
                    v = urllib.quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + urllib.quote_plus(str(elt)))
    return string.join(l, '&')
-
def unescape(data, entities):
    """Replace the entity references in data that entities knows about.

    data: string to process, or None (returned unchanged)
    entities: mapping from complete references such as "&amp;" to their
     replacement text

    References missing from entities are left exactly as they appear.

    """
    if data is None or '&' not in data:
        return data

    def replace_entities(match):
        ent = match.group()
        repl = entities.get(ent, ent)
        return repl

    # Non-greedy, so that adjacent references ("&amp;&lt;") are matched one
    # at a time instead of as a single, unknown, entity.
    return re.sub(r'&\S+?;', replace_entities, data)
-
def startswith(string, initial):
    """Return true if string begins with initial.

    Stand-in for the .startswith() string method.

    """
    prefix_length = len(initial)
    return prefix_length <= len(string) and string[:prefix_length] == initial
-
def issequence(x):
    """Return true if x can be indexed with an integer.

    An IndexError from x[0] still counts as "is a sequence" (it is merely
    empty); TypeError or KeyError means it is not one.

    """
    try:
        x[0]
    except IndexError:
        return True
    except (TypeError, KeyError):
        return False
    return True
-
def isstringlike(x):
    """Return true if x can be concatenated with the empty string.

    Whatever goes wrong while evaluating x + "" counts as "not string-like".

    """
    try:
        x + ""
        return True
    except:
        return False
-
-
-# XXX don't really want to drag this along (MapBase, AList, MimeWriter,
-# _choose_boundary)
-
-# This is essentially the same as UserDict.DictMixin. I wrote this before
-# that, and DictMixin isn't available in 1.5.2 anyway.
class MapBase:
    """Mapping designed to be easily derived from.

    Subclass it and override __init__, __setitem__, __getitem__, __delitem__
    and keys.  Nothing else should need to be overridden, unlike UserDict.
    This significantly simplifies dictionary-like classes.

    Also different from UserDict in that it has a readonly flag, and can be
    updated (and initialised) with a sequence of pairs (key, value).

    """
    def __init__(self, init=None):
        self._data = {}
        self.readonly = False
        if init is not None: self.update(init)

    def __getitem__(self, key):
        return self._data[key]

    def __setitem__(self, key, item):
        if not self.readonly:
            self._data[key] = item
        else:
            raise TypeError("object doesn't support item assignment")

    def __delitem__(self, key):
        if not self.readonly:
            del self._data[key]
        else:
            raise TypeError("object doesn't support item deletion")

    def keys(self):
        return self._data.keys()

    # now the internal workings, there should be no need to override these:

    def clear(self):
        for k in self.keys():
            del self[k]

    def __repr__(self):
        rep = []
        for k, v in self.items():
            rep.append("%s: %s" % (repr(k), repr(v)))
        return self.__class__.__name__+"{"+(string.join(rep, ", "))+"}"

    def copy(self):
        return copy.copy(self)

    def __cmp__(self, dict):
        # note: return value is *not* boolean
        for k, v in self.items():
            if not (dict.has_key(k) and dict[k] == v):
                return 1  # different
        return 0  # the same

    def __len__(self):
        return len(self.keys())

    def values(self):
        r = []
        for k in self.keys():
            r.append(self[k])
        return r

    def items(self):
        # Built only from keys() and __getitem__ so subclasses need not
        # override it.  (Iterating over len(self), an integer, raised
        # TypeError.)
        r = []
        for k in self.keys():
            r.append((k, self[k]))
        return r

    def has_key(self, key):
        return key in self.keys()

    def update(self, map):
        """Add the items of map, a mapping or a sequence of (key, value)
        tuples.

        Raises TypeError if an item is not a tuple.
        """
        if issequence(map) and not isstringlike(map):
            items = map
        else:
            items = map.items()
        for tup in items:
            # type(()) is the tuple type on every Python version; the bare
            # name TupleType was never defined in this module
            if not isinstance(tup, type(())):
                raise TypeError(
                    "MapBase.update requires a map or a sequence of pairs")
            k, v = tup
            self[k] = v

    def get(self, key, failobj=None):
        if key in self.keys():
            return self[key]
        else:
            return failobj

    def setdefault(self, key, failobj=None):
        if not self.has_key(key):
            self[key] = failobj
        return self[key]
-
-
class AList(MapBase):
    """Read-only ordered mapping.

    Built from a sequence of (key, value) pairs whose order is preserved.
    .set_inverted(True) swaps the roles of keys and values.

    """
    def __init__(self, seq=()):
        self.readonly = True
        self._inverted = False
        # three parallel lists: the pairs as given, their keys, their values
        self._data = list(seq[:])
        self._keys = []
        self._values = []
        for key, value in seq:
            self._keys.append(key)
            self._values.append(value)

    def set_inverted(self, inverted):
        if (inverted and not self._inverted) or (
            not inverted and self._inverted):
            self._keys, self._values = self._values, self._keys
        if inverted: self._inverted = True
        else: self._inverted = False

    def __getitem__(self, key):
        try:
            i = self._keys.index(key)
        except ValueError:
            raise KeyError(key)
        return self._values[i]

    def __delitem__(self, key):
        try:
            # .index is a method: it has to be called, not subscripted
            i = self._keys.index(key)
        except ValueError:
            raise KeyError(key)
        # remove the entry from all three lists so they stay aligned
        del self._keys[i]
        del self._values[i]
        del self._data[i]

    def keys(self): return list(self._keys[:])
    def values(self): return list(self._values[:])
    def items(self):
        data = self._data[:]
        if not self._inverted:
            return data
        else:
            newdata = []
            for k, v in data:
                newdata.append((v, k))
            return newdata
-
-# --------------------------------------------------------------------
-# grabbed from Python standard library mimetools module and tweaked to
-# avoid socket.gaierror
# Module-wide lock guarding _counter.  dummy_thread is used when the thread
# module cannot be imported.
try:
    import thread
    _thread = thread; del thread
except ImportError:
    import dummy_thread
    _thread = dummy_thread; del dummy_thread
_counter_lock = _thread.allocate_lock()
del _thread

_counter = 0
def _get_next_counter():
    # Increment the module-level counter while holding the lock and return
    # the new value (the first call returns 1).
    global _counter
    _counter_lock.acquire()
    _counter = _counter + 1
    result = _counter
    _counter_lock.release()
    return result
-
-_prefix = None
-
def _choose_boundary():
    """Return a string usable as a multipart boundary.

    The result is "<prefix>.<time>.<counter>", where the prefix combines the
    host address, user id and process id (each replaced by a fixed fallback
    when unavailable) and is computed once and cached in the module-level
    _prefix.  It is therefore unique within a single program run and very
    unlikely -- though not guaranteed -- to occur in message text.

    The boundary contains dots so you have to quote it in the header."""

    global _prefix
    import time
    import os
    import socket
    if _prefix is None:
        # socket.gaierror is missing on some Pythons; fall back to
        # socket.error there
        lookup_error = getattr(socket, "gaierror", socket.error)
        try:
            hostid = socket.gethostbyname(socket.gethostname())
        except lookup_error:
            hostid = 'localhost'
        try:
            uid = repr(os.getuid())
        except AttributeError:
            uid = '1'
        try:
            pid = repr(os.getpid())
        except AttributeError:
            pid = '1'
        _prefix = "%s.%s.%s" % (hostid, uid, pid)
    now = time.time()
    return "%s.%.3f.%d" % (_prefix, now, _get_next_counter())
-
-# end of code from mimetools module
-# --------------------------------------------------------------------
-
def choose_boundary():
    """Return a multipart boundary string that contains no dots.

    Same as _choose_boundary(), with every "." removed.

    """
    return string.replace(_choose_boundary(), ".", "")
-
-# This cut-n-pasted MimeWriter from standard library is here so can add
-# to HTTP headers rather than message body when appropriate. It also uses
-# \r\n in place of \n. This is nasty.
class MimeWriter:

    """Generic MIME writer.

    Methods:

    __init__()
    addheader()
    flushheaders()
    startbody()
    startmultipartbody()
    nextpart()
    lastpart()

    A MIME writer is much more primitive than a MIME parser.  It
    doesn't seek around on the output file, and it doesn't use large
    amounts of buffer space, so you have to write the parts in the
    order they should occur on the output file.  It does buffer the
    headers you add, allowing you to rearrange their order.

    General usage is:

    f = <open the output file>
    w = MimeWriter(f)
    ...call w.addheader(key, value) 0 or more times...

    followed by either:

    f = w.startbody(content_type)
    ...call f.write(data) for body data...

    or:

    w.startmultipartbody(subtype)
    for each part:
        subwriter = w.nextpart()
        ...use the subwriter's methods to create the subpart...
    w.lastpart()

    The subwriter is another MimeWriter instance, and should be
    treated in the same way as the toplevel MimeWriter.  This way,
    writing recursive body parts is easy.

    Warning: don't forget to call lastpart()!

    XXX There should be more state so calls made in the wrong order
    are detected.

    Some special cases:

    - startbody() just returns the file passed to the constructor;
      but don't use this knowledge, as it may be changed.

    - startmultipartbody() actually returns a file as well;
      this can be used to write the initial 'if you can read this your
      mailer is not MIME-aware' message.

    - If you call flushheaders(), the headers accumulated so far are
      written out (and forgotten); this is useful if you don't need a
      body part at all, e.g. for a subpart of type message/rfc822
      that's (mis)used to store some header-like information.

    - Passing a keyword argument 'prefix=<flag>' to addheader(),
      start*body() affects where the header is inserted; 0 means
      append at the end, 1 means insert at the start; default is
      append for addheader(), but insert for start*body(), which use
      it to determine where the Content-type header goes.

    - Passing a true add_to_http_hdrs sends the header to the http_hdrs
      list given to the constructor, as a (key, value) pair, instead of
      to the output file.

    """

    def __init__(self, fp, http_hdrs=None):
        # http_hdrs: list receiving (key, value) pairs for headers added
        # with add_to_http_hdrs true
        self._http_hdrs = http_hdrs
        self._fp = fp
        self._headers = []  # buffered header lines, not yet written to fp
        self._boundary = []  # stack of boundaries of the open multipart bodies
        self._first_part = True

    def addheader(self, key, value, prefix=0,
                  add_to_http_hdrs=0):
        """
        prefix is ignored if add_to_http_hdrs is true.
        """
        lines = string.split(value, "\r\n")
        # drop empty leading and trailing lines
        while lines and not lines[-1]: del lines[-1]
        while lines and not lines[0]: del lines[0]
        if add_to_http_hdrs:
            # HTTP headers are recorded as a single unfolded line
            value = string.join(lines, "")
            self._http_hdrs.append((key, value))
        else:
            # continuation lines are stripped and re-indented
            for i in range(1, len(lines)):
                lines[i] = " " + string.strip(lines[i])
            value = string.join(lines, "\r\n") + "\r\n"
            line = key + ": " + value
            if prefix:
                self._headers.insert(0, line)
            else:
                self._headers.append(line)

    def flushheaders(self):
        # Write the buffered header lines to the file and forget them.
        self._fp.writelines(self._headers)
        self._headers = []

    def startbody(self, ctype=None, plist=[], prefix=1,
                  add_to_http_hdrs=0, content_type=1):
        """
        prefix is ignored if add_to_http_hdrs is true.
        """
        # A Content-type header is added only when both content_type and
        # ctype are true; plist supplies its (name, value) parameters.
        if content_type and ctype:
            for name, value in plist:
                ctype = ctype + ';\r\n %s=%s' % (name, value)
            self.addheader("Content-type", ctype, prefix=prefix,
                           add_to_http_hdrs=add_to_http_hdrs)
        self.flushheaders()
        # blank line separating headers from body; not written when the
        # header went to the HTTP header list
        if not add_to_http_hdrs: self._fp.write("\r\n")
        self._first_part = True
        return self._fp

    def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1,
                           add_to_http_hdrs=0, content_type=1):
        # Pushes the boundary (a fresh one unless given) and starts a
        # "multipart/<subtype>" body carrying it as a parameter.
        boundary = boundary or choose_boundary()
        self._boundary.append(boundary)
        return self.startbody("multipart/" + subtype,
                              [("boundary", boundary)] + plist,
                              prefix=prefix,
                              add_to_http_hdrs=add_to_http_hdrs,
                              content_type=content_type)

    def nextpart(self):
        # Writes the delimiter that opens the next part and returns a new
        # writer of the same class on the same file.
        boundary = self._boundary[-1]
        if self._first_part:
            self._first_part = False
        else:
            # every part after the first is preceded by a line break
            self._fp.write("\r\n")
        self._fp.write("--" + boundary + "\r\n")
        return self.__class__(self._fp)

    def lastpart(self):
        # Writes the closing delimiter of the innermost multipart body; an
        # empty part is opened first if no part was written at all.
        if self._first_part:
            self.nextpart()
        boundary = self._boundary.pop()
        self._fp.write("\r\n--" + boundary + "--\r\n")
-
# Lookup and count errors; all are ValueError subclasses.  Their raise sites
# are outside this part of the file.
class ControlNotFoundError(ValueError): pass
class ItemNotFoundError(ValueError): pass
class ItemCountError(ValueError): pass

# Raised by the form parser for structurally invalid HTML (for example
# nested FORMs).
class ParseError(Exception): pass
-
-
-class _AbstractFormParser:
- """forms attribute contains HTMLForm instances on completion."""
- # thanks to Moshe Zadka for an example of sgmllib/htmllib usage
- def __init__(self, entitydefs=None):
- if entitydefs is None:
- entitydefs = get_entitydefs()
- self._entitydefs = entitydefs
-
- self.base = None
- self.forms = []
- self.labels = []
- self._current_label = None
- self._current_form = None
- self._select = None
- self._optgroup = None
- self._option = None
- self._textarea = None
-
- def do_base(self, attrs):
- for key, value in attrs:
- if key == "href":
- self.base = value
-
- def end_body(self):
- if self._current_label is not None:
- self.end_label()
- if self._current_form is not None:
- self.end_form()
-
- def start_form(self, attrs):
- if self._current_form is not None:
- raise ParseError("nested FORMs")
- name = None
- action = None
- enctype = "application/x-www-form-urlencoded"
- method = "GET"
- d = {}
- for key, value in attrs:
- if key == "name":
- name = value
- elif key == "action":
- action = value
- elif key == "method":
- method = string.upper(value)
- elif key == "enctype":
- enctype = string.lower(value)
- d[key] = value
- controls = []
- self._current_form = (name, action, method, enctype), d, controls
-
- def end_form(self):
- if self._current_label is not None:
- self.end_label()
- if self._current_form is None:
- raise ParseError("end of FORM before start")
- self.forms.append(self._current_form)
- self._current_form = None
-
- def start_select(self, attrs):
- if self._current_form is None:
- raise ParseError("start of SELECT before start of FORM")
- if self._select is not None:
- raise ParseError("nested SELECTs")
- if self._textarea is not None:
- raise ParseError("SELECT inside TEXTAREA")
- d = {}
- d.update(attrs)
-
- self._select = d
- self._add_label(d)
-
- self._append_select_control({"__select": d})
-
- def end_select(self):
- if self._current_form is None:
- raise ParseError("end of SELECT before start of FORM")
- if self._select is None:
- raise ParseError("end of SELECT before start")
-
- if self._option is not None:
- self._end_option()
-
- self._select = None
-
- def start_optgroup(self, attrs):
- if self._select is None:
- raise ParseError("OPTGROUP outside of SELECT")
- d = {}
- d.update(attrs)
-
- self._optgroup = d
-
- def end_optgroup(self):
- if self._optgroup is None:
- raise ParseError("end of OPTGROUP before start")
- self._optgroup = None
-
- def _start_option(self, attrs):
- if self._select is None:
- raise ParseError("OPTION outside of SELECT")
- if self._option is not None:
- self._end_option()
-
- self._option = {}
- self._option.update(attrs)
- if (self._optgroup and self._optgroup.has_key("disabled") and
- not self._option.has_key("disabled")):
- self._option["disabled"] = None
-
- def _end_option(self):
- if self._option is None:
- raise ParseError("end of OPTION before start")
-
- contents = string.strip(self._option.get("contents", ""))
- self._option["contents"] = contents
- if not self._option.has_key("value"):
- self._option["value"] = contents
- if not self._option.has_key("label"):
- self._option["label"] = contents
- # stuff dict of SELECT HTML attrs into a special private key
- # (gets deleted again later)
- self._option["__select"] = self._select
- self._append_select_control(self._option)
- self._option = None
-
- def _append_select_control(self, attrs):
- controls = self._current_form[2]
- name = self._select.get("name")
- controls.append(("select", name, attrs))
-
- def start_textarea(self, attrs):
- if self._current_form is None:
- raise ParseError("start of TEXTAREA before start of FORM")
- if self._textarea is not None:
- raise ParseError("nested TEXTAREAs")
- if self._select is not None:
- raise ParseError("TEXTAREA inside SELECT")
- d = {}
- d.update(attrs)
- self._add_label(d)
-
- self._textarea = d
-
- def end_textarea(self):
- if self._current_form is None:
- raise ParseError("end of TEXTAREA before start of FORM")
- if self._textarea is None:
- raise ParseError("end of TEXTAREA before start")
- controls = self._current_form[2]
- name = self._textarea.get("name")
- controls.append(("textarea", name, self._textarea))
- self._textarea = None
-
- def start_label(self, attrs):
- if self._current_label:
- self.end_label()
- attrs = dict(attrs)
- taken = bool(attrs.get('for')) # empty id is invalid
- attrs['__text'] = ''
- attrs['__taken'] = taken
- if taken:
- self.labels.append(attrs)
- self._current_label = attrs
-
- def end_label(self):
- label = self._current_label
- if label is None:
- # something is ugly in the HTML, but we're ignoring it
- return
- self._current_label = None
- label['__text'] = label['__text']
- del label['__taken'] # if it is staying around, it is True in all cases
-
- def _add_label(self, d):
- if self._current_label is not None:
- if self._current_label['__taken']:
- self.end_label() # be fuzzy
- else:
- self._current_label['__taken'] = True
- d['__label'] = self._current_label
-
- def handle_data(self, data):
- if self._option is not None:
- # self._option is a dictionary of the OPTION element's HTML
- # attributes, but it has two special keys, one of which is the
- # special "contents" key contains text between OPTION tags (the
- # other is the "__select" key: see the end_option method)
- map = self._option
- key = "contents"
- elif self._textarea is not None:
- map = self._textarea
- key = "value"
- elif self._current_label is not None: # not if within option or
- # textarea
- map = self._current_label
- key = "__text"
- else:
- return
-
- if not map.has_key(key):
- map[key] = data
- else:
- map[key] = map[key] + data
-
- def do_button(self, attrs):
- if self._current_form is None:
- raise ParseError("start of BUTTON before start of FORM")
- d = {}
- d["type"] = "submit" # default
- d.update(attrs)
- controls = self._current_form[2]
-
- type = d["type"]
- name = d.get("name")
- # we don't want to lose information, so use a type string that
- # doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON}
- # eg. type for BUTTON/RESET is "resetbutton"
- # (type for INPUT/RESET is "reset")
- type = type+"button"
- self._add_label(d)
- controls.append((type, name, d))
-
- def do_input(self, attrs):
- if self._current_form is None:
- raise ParseError("start of INPUT before start of FORM")
- d = {}
- d["type"] = "text" # default
- d.update(attrs)
- controls = self._current_form[2]
-
- type = d["type"]
- name = d.get("name")
- self._add_label(d)
- controls.append((type, name, d))
-
- def do_isindex(self, attrs):
- if self._current_form is None:
- raise ParseError("start of ISINDEX before start of FORM")
- d = {}
- d.update(attrs)
- controls = self._current_form[2]
-
- self._add_label(d)
- # isindex doesn't have type or name HTML attributes
- controls.append(("isindex", None, d))
-
- def handle_entityref(self, name):
- table = self._entitydefs
- fullname = '&%s;' % name
- if table.has_key(fullname):
- self.handle_data(table[fullname])
- else:
- self.unknown_entityref(name)
- return
-
- def unescape_attr(self, name):
- return unescape(name, self._entitydefs)
-
- def unescape_attrs(self, attrs):
- escaped_attrs = {}
- for key, val in attrs.items():
- try:
- val.items
- except AttributeError:
- escaped_attrs[key] = self.unescape_attr(val)
- else:
- # eg. "__select" -- yuck!
- escaped_attrs[key] = self.unescape_attrs(val)
- return escaped_attrs
-
- def unknown_entityref(self, ref): self.handle_data('&%s;' % ref)
- def unknown_charref(self, ref): self.handle_data('&#%s;' % ref)
-
-
-# HTMLParser.HTMLParser is recent, so live without it if it's not available
-# (also, htmllib.HTMLParser is much more tolerant of bad HTML)
-try:
- import HTMLParser
-except ImportError:
- class XHTMLCompatibleFormParser:
- def __init__(self, entitydefs=None):
- raise ValueError("HTMLParser could not be imported")
-else:
- class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
- """Good for XHTML, bad for tolerance of incorrect HTML."""
- # thanks to Michael Howitz for this!
- def __init__(self, entitydefs=None):
- HTMLParser.HTMLParser.__init__(self)
- _AbstractFormParser.__init__(self, entitydefs)
-
- def start_option(self, attrs):
- _AbstractFormParser._start_option(self, attrs)
-
- def end_option(self):
- _AbstractFormParser._end_option(self)
-
- def handle_starttag(self, tag, attrs):
- try:
- method = getattr(self, 'start_' + tag)
- except AttributeError:
- try:
- method = getattr(self, 'do_' + tag)
- except AttributeError:
- pass # unknown tag
- else:
- method(attrs)
- else:
- method(attrs)
-
- def handle_endtag(self, tag):
- try:
- method = getattr(self, 'end_' + tag)
- except AttributeError:
- pass # unknown tag
- else:
- method()
-
- # taken from sgmllib, with changes
- def handle_charref(self, name):
- try:
- n = int(name)
- except ValueError:
- self.unknown_charref(name)
- return
- if not 0 <= n <= 255:
- self.unknown_charref(name)
- return
- self.handle_data(chr(n))
-
- def unescape(self, name):
- # Use the entitydefs passed into constructor, not
- # HTMLParser.HTMLParser's entitydefs.
- return self.unescape_attr(name)
-
- def unescape_attr_if_required(self, name):
- return name # HTMLParser.HTMLParser already did it
- def unescape_attrs_if_required(self, attrs):
- return attrs # ditto
-
-import htmllib, formatter
-class FormParser(_AbstractFormParser, htmllib.HTMLParser):
- """Good for tolerance of incorrect HTML, bad for XHTML."""
- def __init__(self, entitydefs=None):
- htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
- _AbstractFormParser.__init__(self, entitydefs)
-
- def do_option(self, attrs):
- _AbstractFormParser._start_option(self, attrs)
-
- def unescape_attr_if_required(self, name):
- return self.unescape_attr(name)
- def unescape_attrs_if_required(self, attrs):
- return self.unescape_attrs(attrs)
-
-#FormParser = XHTMLCompatibleFormParser # testing hack
-
-def get_entitydefs():
- entitydefs = {}
- for name, char in htmlentitydefs.entitydefs.items():
- entitydefs["&%s;" % name] = char
- return entitydefs
-
-def ParseResponse(response, select_default=False,
- ignore_errors=False, # ignored!
- form_parser_class=FormParser,
- request_class=urllib2.Request,
- entitydefs=None):
- """Parse HTTP response and return a list of HTMLForm instances.
-
- The return value of urllib2.urlopen can be conveniently passed to this
- function as the response parameter.
-
- ClientForm.ParseError is raised on parse errors.
-
- response: file-like object (supporting read() method) with a method
- geturl(), returning the URI of the HTTP response
- select_default: for multiple-selection SELECT controls and RADIO controls,
- pick the first item as the default if none are selected in the HTML
- form_parser_class: class to instantiate and use to pass
- request_class: class to return from .click() method (default is
- urllib2.Request)
- entitydefs: mapping like {'&amp;': '&', ...} containing HTML entity
- definitions (a sensible default is used)
-
- Pass a true value for select_default if you want the behaviour specified by
- RFC 1866 (the HTML 2.0 standard), which is to select the first item in a
- RADIO or multiple-selection SELECT control if none were selected in the
- HTML. Most browsers (including Microsoft Internet Explorer (IE) and
- Netscape Navigator) instead leave all items unselected in these cases. The
- W3C HTML 4.0 standard leaves this behaviour undefined in the case of
- multiple-selection SELECT controls, but insists that at least one RADIO
- button should be checked at all times, in contradiction to browser
- behaviour.
-
- There is a choice of parsers. ClientForm.XHTMLCompatibleFormParser (uses
- HTMLParser.HTMLParser) works best for XHTML, ClientForm.FormParser (uses
- htmllib.HTMLParser) (the default) works best for ordinary grubby HTML.
- Note that HTMLParser is only available in Python 2.2 and later. You can
- pass your own class in here as a hack to work around bad HTML, but at your
- own risk: there is no well-defined interface.
-
- """
- return ParseFile(response, response.geturl(), select_default,
- False,
- form_parser_class,
- request_class,
- entitydefs)
-
-def ParseFile(file, base_uri, select_default=False,
- ignore_errors=False, # ignored!
- form_parser_class=FormParser,
- request_class=urllib2.Request,
- entitydefs=None):
- """Parse HTML and return a list of HTMLForm instances.
-
- ClientForm.ParseError is raised on parse errors.
-
- file: file-like object (supporting read() method) containing HTML with zero
- or more forms to be parsed
- base_uri: the URI of the document (note that the base URI used to submit
- the form will be that given in the BASE element if present, not that of
- the document)
-
- For the other arguments and further details, see ParseResponse.__doc__.
-
- """
- fp = form_parser_class(entitydefs)
- while 1:
- data = file.read(CHUNK)
- try:
- fp.feed(data)
- except ParseError, e:
- e.base_uri = base_uri
- raise
- if len(data) != CHUNK: break
- if fp.base is not None:
- # HTML BASE element takes precedence over document URI
- base_uri = fp.base
- labels = [] # Label(label) for label in fp.labels]
- id_to_labels = {}
- for l in fp.labels:
- label = Label(l)
- labels.append(label)
- for_id = l['for']
- coll = id_to_labels.get(for_id)
- if coll is None:
- id_to_labels[for_id] = [label]
- else:
- coll.append(label)
- forms = []
- for (name, action, method, enctype), attrs, controls in fp.forms:
- if action is None:
- action = base_uri
- else:
- action = urljoin(base_uri, action)
- action = fp.unescape_attr_if_required(action)
- name = fp.unescape_attr_if_required(name)
- attrs = fp.unescape_attrs_if_required(attrs)
- form = HTMLForm( # would be nice to make class (form builder) pluggable
- action, method, enctype, name, attrs, request_class,
- forms, labels, id_to_labels)
- for type, name, attrs in controls:
- attrs = fp.unescape_attrs_if_required(attrs)
- name = fp.unescape_attr_if_required(name)
- form.new_control(type, name, attrs, select_default=select_default)
- forms.append(form)
- for form in forms:
- form.fixup()
- return forms
-
-class Label(object):
- def __init__(self, attrs):
- self.id = attrs.get('for')
- self.text = compressText(attrs.get('__text'))
- self.attrs = attrs
-
-def _getLabel(attrs):
- label = attrs.get('__label')
- if label is not None:
- label = Label(label)
- return label
-
-class Control:
- """An HTML form control.
-
- An HTMLForm contains a sequence of Controls. HTMLForm delegates lots of
- things to Control objects, and most of Control's methods are, in effect,
- documented by the HTMLForm docstrings.
-
- The Controls in an HTMLForm can be got at via the HTMLForm.find_control
- method or the HTMLForm.controls attribute.
-
- Control instances are usually constructed using the ParseFile /
- ParseResponse functions, so you can probably ignore the rest of this
- paragraph. A Control is only properly initialised after the fixup method
- has been called. In fact, this is only strictly necessary for ListControl
- instances. This is necessary because ListControls are built up from
- ListControls each containing only a single item, and their initial value(s)
- can only be known after the sequence is complete.
-
- The types and values that are acceptable for assignment to the value
- attribute are defined by subclasses.
-
- If the disabled attribute is true, this represents the state typically
- represented by browsers by `greying out' a control. If the disabled
- attribute is true, the Control will raise AttributeError if an attempt is
- made to change its value. In addition, the control will not be considered
- `successful' as defined by the W3C HTML 4 standard -- ie. it will
- contribute no data to the return value of the HTMLForm.click* methods. To
- enable a control, set the disabled attribute to a false value.
-
- If the readonly attribute is true, the Control will raise AttributeError if
- an attempt is made to change its value. To make a control writable, set
- the readonly attribute to a false value.
-
- All controls have the disabled and readonly attributes, not only those that
- may have the HTML attributes of the same names.
-
- On assignment to the value attribute, the following exceptions are raised:
- TypeError, AttributeError (if the value attribute should not be assigned
- to, because the control is disabled, for example) and ValueError.
-
- If the name or value attributes are None, or the value is an empty list, or
- if the control is disabled, the control is not successful.
-
- Public attributes:
-
- type: string describing type of control (see the keys of the
- HTMLForm.type2class dictionary for the allowable values) (readonly)
- name: name of control (readonly)
- value: current value of control (subclasses may allow a single value, a
- sequence of values, or either)
- disabled: disabled state
- readonly: readonly state
- id: value of id HTML attribute
-
- """
- def __init__(self, type, name, attrs):
- """
- type: string describing type of control (see the keys of the
- HTMLForm.type2class dictionary for the allowable values)
- name: control name
- attrs: HTML attributes of control's HTML element
-
- """
- raise NotImplementedError()
-
- def add_to_form(self, form):
- self._form = form
- form.controls.append(self)
-
- def fixup(self):
- pass
-
- def is_of_kind(self, kind):
- raise NotImplementedError()
-
- def clear(self):
- raise NotImplementedError()
-
- def __getattr__(self, name): raise NotImplementedError()
- def __setattr__(self, name, value): raise NotImplementedError()
-
- def pairs(self):
- """Return list of (key, value) pairs suitable for passing to urlencode.
- """
- raise NotImplementedError()
-
- def _write_mime_data(self, mw):
- """Write data for this control to a MimeWriter."""
- # called by HTMLForm
- for name, value in self.pairs():
- mw2 = mw.nextpart()
- mw2.addheader("Content-disposition",
- 'form-data; name="%s"' % name, 1)
- f = mw2.startbody(prefix=0)
- f.write(value)
-
- def __str__(self):
- raise NotImplementedError()
-
- def getLabels(self):
- res = []
- if self._label:
- res.append(self._label)
- if self.id:
- res.extend(self._form._id_to_labels.get(self.id, ()))
- return res
-
-
-#---------------------------------------------------
-class ScalarControl(Control):
- """Control whose value is not restricted to one of a prescribed set.
-
- Some ScalarControls don't accept any value attribute. Otherwise, takes a
- single value, which must be string-like.
-
- Additional read-only public attribute:
-
- attrs: dictionary mapping the names of original HTML attributes of the
- control to their values
-
- """
- def __init__(self, type, name, attrs):
- self._label = _getLabel(attrs)
- self.__dict__["type"] = string.lower(type)
- self.__dict__["name"] = name
- self._value = attrs.get("value")
- self.disabled = attrs.has_key("disabled")
- self.readonly = attrs.has_key("readonly")
- self.id = attrs.get("id")
-
- self.attrs = attrs.copy()
-
- self._clicked = False
-
- def __getattr__(self, name):
- if name == "value":
- return self.__dict__["_value"]
- else:
- raise AttributeError("%s instance has no attribute '%s'" %
- (self.__class__.__name__, name))
-
- def __setattr__(self, name, value):
- if name == "value":
- if not isstringlike(value):
- raise TypeError("must assign a string")
- elif self.readonly:
- raise AttributeError("control '%s' is readonly" % self.name)
- elif self.disabled:
- raise AttributeError("control '%s' is disabled" % self.name)
- self.__dict__["_value"] = value
- elif name in ("name", "type"):
- raise AttributeError("%s attribute is readonly" % name)
- else:
- self.__dict__[name] = value
-
- def pairs(self):
- name = self.name
- value = self.value
- if name is None or value is None or self.disabled:
- return []
- return [(name, value)]
-
- def clear(self):
- if self.readonly:
- raise AttributeError("control '%s' is readonly" % self.name)
- self.__dict__["_value"] = None
-
- def __str__(self):
- name = self.name
- value = self.value
- if name is None: name = "<None>"
- if value is None: value = "<None>"
-
- infos = []
- if self.disabled: infos.append("disabled")
- if self.readonly: infos.append("readonly")
- info = string.join(infos, ", ")
- if info: info = " (%s)" % info
-
- return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info)
-
-
-#---------------------------------------------------
-class TextControl(ScalarControl):
- """Textual input control.
-
- Covers:
-
- INPUT/TEXT
- INPUT/PASSWORD
- INPUT/FILE
- INPUT/HIDDEN
- TEXTAREA
-
- """
- def __init__(self, type, name, attrs):
- ScalarControl.__init__(self, type, name, attrs)
- if self.type == "hidden": self.readonly = True
- if self._value is None:
- self._value = ""
-
- def is_of_kind(self, kind): return kind == "text"
-
-#---------------------------------------------------
-class FileControl(ScalarControl):
- """File upload with INPUT TYPE=FILE.
-
- The value attribute of a FileControl is always None. Use add_file instead.
-
- Additional public method: add_file
-
- """
-
- def __init__(self, type, name, attrs):
- ScalarControl.__init__(self, type, name, attrs)
- self._value = None
- self._upload_data = []
-
- def is_of_kind(self, kind): return kind == "file"
-
- def clear(self):
- if self.readonly:
- raise AttributeError("control '%s' is readonly" % self.name)
- self._upload_data = []
-
- def __setattr__(self, name, value):
- if name in ("value", "name", "type"):
- raise AttributeError("%s attribute is readonly" % name)
- else:
- self.__dict__[name] = value
-
- def add_file(self, file_object, content_type=None, filename=None):
- if not hasattr(file_object, "read"):
- raise TypeError("file-like object must have read method")
- if content_type is not None and not isstringlike(content_type):
- raise TypeError("content type must be None or string-like")
- if filename is not None and not isstringlike(filename):
- raise TypeError("filename must be None or string-like")
- if content_type is None:
- content_type = "application/octet-stream"
- self._upload_data.append((file_object, content_type, filename))
-
- def pairs(self):
- # XXX should it be successful even if unnamed?
- if self.name is None or self.disabled:
- return []
- return [(self.name, "")]
-
- def _write_mime_data(self, mw):
- # called by HTMLForm
- if len(self._upload_data) == 1:
- # single file
- file_object, content_type, filename = self._upload_data[0]
- mw2 = mw.nextpart()
- fn_part = filename and ('; filename="%s"' % filename) or ''
- disp = 'form-data; name="%s"%s' % (self.name, fn_part)
- mw2.addheader("Content-disposition", disp, prefix=1)
- fh = mw2.startbody(content_type, prefix=0)
- fh.write(file_object.read())
- elif len(self._upload_data) != 0:
- # multiple files
- mw2 = mw.nextpart()
- disp = 'form-data; name="%s"' % self.name
- mw2.addheader("Content-disposition", disp, prefix=1)
- fh = mw2.startmultipartbody("mixed", prefix=0)
- for file_object, content_type, filename in self._upload_data:
- mw3 = mw2.nextpart()
- fn_part = filename and ('; filename="%s"' % filename) or ''
- disp = 'file%s' % fn_part
- mw3.addheader("Content-disposition", disp, prefix=1)
- fh2 = mw3.startbody(content_type, prefix=0)
- fh2.write(file_object.read())
- mw2.lastpart()
-
- def __str__(self):
- name = self.name
- if name is None: name = "<None>"
-
- if not self._upload_data:
- value = "<No files added>"
- else:
- value = []
- for file, ctype, filename in self._upload_data:
- if filename is None:
- value.append("<Unnamed file>")
- else:
- value.append(filename)
- value = string.join(value, ", ")
-
- info = []
- if self.disabled: info.append("disabled")
- if self.readonly: info.append("readonly")
- info = string.join(info, ", ")
- if info: info = " (%s)" % info
-
- return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info)
-
-
-#---------------------------------------------------
-class IsindexControl(ScalarControl):
- """ISINDEX control.
-
- ISINDEX is the odd-one-out of HTML form controls. In fact, it isn't really
- part of regular HTML forms at all, and predates it. You're only allowed
- one ISINDEX per HTML document. ISINDEX and regular form submission are
- mutually exclusive -- either submit a form, or the ISINDEX.
-
- Having said this, since ISINDEX controls may appear in forms (which is
- probably bad HTML), ParseFile / ParseResponse will include them in the
- HTMLForm instances it returns. You can set the ISINDEX's value, as with
- any other control (but note that ISINDEX controls have no name, so you'll
- need to use the type argument of set_value!). When you submit the form,
- the ISINDEX will not be successful (ie., no data will get returned to the
- server as a result of its presence), unless you click on the ISINDEX
- control, in which case the ISINDEX gets submitted instead of the form:
-
- form.set_value("my isindex value", type="isindex")
- urllib2.urlopen(form.click(type="isindex"))
-
- ISINDEX elements outside of FORMs are ignored. If you want to submit one
- by hand, do it like so:
-
- url = urlparse.urljoin(page_uri, "?"+urllib.quote_plus("my isindex value"))
- result = urllib2.urlopen(url)
-
- """
- def __init__(self, type, name, attrs):
- ScalarControl.__init__(self, type, name, attrs)
- if self._value is None:
- self._value = ""
-
- def is_of_kind(self, kind): return kind in ["text", "clickable"]
-
- def pairs(self):
- return []
-
- def _click(self, form, coord, return_type, request_class=urllib2.Request):
- # Relative URL for ISINDEX submission: instead of "foo=bar+baz",
- # want "bar+baz".
- # This doesn't seem to be specified in HTML 4.01 spec. (ISINDEX is
- # deprecated in 4.01, but it should still say how to submit it).
- # Submission of ISINDEX is explained in the HTML 3.2 spec, though.
- parts = urlparse.urlparse(form.action)
- rest, (query, frag) = parts[:-2], parts[-2:]
- parts = rest + (urllib.quote_plus(self.value), "")
- url = urlparse.urlunparse(parts)
- req_data = url, None, []
-
- if return_type == "pairs":
- return []
- elif return_type == "request_data":
- return req_data
- else:
- return request_class(url)
-
- def __str__(self):
- value = self.value
- if value is None: value = "<None>"
-
- infos = []
- if self.disabled: infos.append("disabled")
- if self.readonly: infos.append("readonly")
- info = string.join(infos, ", ")
- if info: info = " (%s)" % info
-
- return "<%s(%s)%s>" % (self.__class__.__name__, value, info)
-
-
-#---------------------------------------------------
-class IgnoreControl(ScalarControl):
- """Control that we're not interested in.
-
- Covers:
-
- INPUT/RESET
- BUTTON/RESET
- INPUT/BUTTON
- BUTTON/BUTTON
-
- These controls are always unsuccessful, in the terminology of HTML 4 (ie.
- they never require any information to be returned to the server).
-
- BUTTON/BUTTON is used to generate events for script embedded in HTML.
-
- The value attribute of IgnoreControl is always None.
-
- """
- def __init__(self, type, name, attrs):
- ScalarControl.__init__(self, type, name, attrs)
- self._value = None
-
- def is_of_kind(self, kind): return False
-
- def __setattr__(self, name, value):
- if name == "value":
- raise AttributeError(
- "control '%s' is ignored, hence read-only" % self.name)
- elif name in ("name", "type"):
- raise AttributeError("%s attribute is readonly" % name)
- else:
- self.__dict__[name] = value
-
-
-#---------------------------------------------------
-# ListControls
-
-# helpers and subsidiary classes
-
-class Item(object):
- def __init__(self, control, attrs):
- label = _getLabel(attrs)
- self.__dict__.update({
- 'value': attrs['value'],
- '_labels': label and [label] or [],
- 'attrs': attrs,
- 'control': control,
- '_disabled': attrs.has_key("disabled"),
- '_selected': False,
- 'id': attrs.get('id'),
- })
-
- def getLabels(self):
- res = []
- res.extend(self._labels)
- if self.id:
- res.extend(self.control._form._id_to_labels.get(self.id, ()))
- return res
-
- # selected and disabled properties
- def __getattr__(self, name):
- if name=='selected':
- return self._selected
- elif name=='disabled':
- return self._disabled
- raise AttributeError(name)
-
- def __setattr__(self, name, value):
- if name == 'selected':
- if bool(value) != bool(self._selected):
- self.control._set_selected_state(self, value)
- elif name == 'disabled':
- if bool(value) != bool(self._disabled):
- self.control._set_item_disabled(self, value)
- else:
- raise AttributeError(name)
-
- def __str__(self):
- res = self.value
- if self.selected:
- res = '*' + res
- if self.disabled:
- res = '(%s)' % res
- return res
-
- def __repr__(self):
- return "<%s value=%r id=%r>" % (
- self.__class__.__name__, self.value, self.id)
-
-# how to remove items from a list container: delete them as usual
-# ("del control.items[:]", for instance).
-# how to add items to a list container: instantiate Item with control, and add
-# to list ("control.items.append(Item(control, {...attrs...}))", for instance).
-# You never want an item to have an incorrect reference to its control (and
-# thus you never want an item to be in more than one control).
-
-class AmbiguityError(Exception):
- pass
-
-def disambiguate(items, count, value):
- if not items:
- raise ItemNotFoundError(value)
- if count is None:
- if len(items) > 1:
- raise AmbiguityError(value)
- return items[0]
- else:
- return items[count]
-
-class ListControl(Control):
- """Control representing a sequence of items.
-
- The value attribute of a ListControl represents the selected list items in
- the control.
-
- ListControl implements both list controls that take a single value and
- those that take multiple values.
-
- ListControls accept sequence values only. Some controls only accept
- sequences of length 0 or 1 (RADIO, and single-selection SELECT).
- In those cases, ItemCountError is raised if len(sequence) > 1. CHECKBOXes
- and multiple-selection SELECTs (those having the "multiple" HTML attribute)
- accept sequences of any length.
-
- Note the following mistake:
-
- control.value = some_value
- assert control.value == some_value # not necessarily true
-
- The reason for this is that the value attribute always gives the list items
- in the order they were listed in the HTML.
-
- ListControl items can also be referred to by their labels instead of names.
- Use the by_label argument, and the set_value_by_label, get_value_by_label
- methods.
-
- Note that, rather confusingly, though SELECT controls are represented in
- HTML by SELECT elements (which contain OPTION elements, representing
- individual list items), CHECKBOXes and RADIOs are not represented by *any*
- element. Instead, those controls are represented by a collection of INPUT
- elements. For example, this is a SELECT control, named "control1":
-
- <select name="control1">
- <option>foo</option>
- <option value="1">bar</option>
- </select>
-
- and this is a CHECKBOX control, named "control2":
-
- <input type="checkbox" name="control2" value="foo" id="cbe1">
- <input type="checkbox" name="control2" value="bar" id="cbe2">
-
- The id attribute of a CHECKBOX or RADIO ListControl is always that of its
- first element (for example, "cbe1" above).
-
-
- Additional read-only public attribute: multiple.
-
- """
-
- # ListControls are built up by the parser from their component items by
- # creating one ListControl per item, consolidating them into a single
- # master ListControl held by the HTMLForm:
-
- # -User calls form.new_control(...)
- # -Form creates Control, and calls control.add_to_form(self).
- # -Control looks for a Control with the same name and type in the form,
- # and if it finds one, merges itself with that control by calling
- # control.merge_control(self). The first Control added to the form, of
- # a particular name and type, is the only one that survives in the
- # form.
- # -Form calls control.fixup for all its controls. ListControls in the
- # form know they can now safely pick their default values.
-
- # To create a ListControl without an HTMLForm, use:
-
- # control.merge_control(new_control)
-
- # (actually, it's much easier just to use ParseFile)
-
- _label = None
-
- def __init__(self, type, name, attrs={}, select_default=False,
- called_as_base_class=False):
- """
- select_default: for RADIO and multiple-selection SELECT controls, pick
- the first item as the default if no 'selected' HTML attribute is
- present
-
- """
- if not called_as_base_class:
- raise NotImplementedError()
-
- self.__dict__["type"] = string.lower(type)
- self.__dict__["name"] = name
- self._value = attrs.get("value")
- self.disabled = False
- self.readonly = False
- self.id = attrs.get("id")
-
- # As Controls are merged in with .merge_control(), self.attrs will
- # refer to each Control in turn -- always the most recently merged
- # control. Each merged-in Control instance corresponds to a single
- # list item: see ListControl.__doc__.
- self.items = []
-
- self._select_default = select_default
- self._clicked = False
-
- def clear(self):
- self.value = []
-
- def is_of_kind(self, kind):
- if kind == "list":
- return True
- elif kind == "multilist":
- return bool(self.multiple)
- elif kind == "singlelist":
- return not self.multiple
- else:
- return False
-
- def items_from_label(self, label, exclude_disabled=False):
- if not isstringlike(label): # why not isinstance basestring?
- raise TypeError("item label must be string-like")
- # check all labels on the items, then if any of the values have
- # an id, go through all the collected labels on self._form._labels and
- # see if any of them match.
- items = [] # order is important
- mapping = self._form._id_to_labels
- for o in self.items:
- if not exclude_disabled or not o.disabled:
- for l in o.getLabels():
- if label in l.text:
- items.append(o)
- break
- return items
-
- def items_from_value(self, value, exclude_disabled=False):
- if not isstringlike(value):
- raise TypeError("item value must be string-like")
- return [o for o in self.items if
- o.value == value and (not exclude_disabled or not o.disabled)]
-
- def get(self, name, by_label=False, count=None, exclude_disabled=False):
- if by_label:
- method = self.items_from_label
- else:
- method = self.items_from_value
- return disambiguate(method(name, exclude_disabled), count, name)
-
- def toggle(self, name, by_label=False, count=None):
- deprecation(
- "item = control.get(...); item.selected = not item.selected")
- o = self.get(name, by_label, count)
- self._set_selected_state(o, not o.selected)
-
- def set(self, selected, name, by_label=False, count=None):
- deprecation(
- "control.get(...).selected = <boolean>")
- self._set_selected_state(self.get(name, by_label, count), selected)
-
- def _set_selected_state(self, item, action):
- """
- index: index of item
- action:
- bool False: off
- bool True: on
- """
- if self.disabled:
- raise AttributeError("control '%s' is disabled" % self.name)
- if self.readonly:
- raise AttributeError("control '%s' is readonly" % self.name)
- action == bool(action)
- if item.disabled:
- # I'd prefer ValueError
- raise AttributeError("item is disabled")
- elif action != item.selected:
- if self.multiple:
- item.__dict__['_selected'] = action
- else:
- if not action:
- item.__dict__['_selected'] = action
- else:
- selected = [o for o in self.items
- if o.selected and not o.disabled]
- # disabled items are not changeable but also
- # not 'successful': their values should not be sent to
- # the server, so they are effectively invisible,
- # whether or not the control considers itself to be
- # selected
- for s in selected:
- s.__dict__['_selected'] = False
- item.__dict__['_selected'] = True
-
- def toggle_single(self, by_label=None):
- deprecation(
- "control.items[0].selected = not control.items[0].selected")
- if len(self.items) != 1:
- raise ItemCountError(
- "'%s' is not a single-item control" % self.name)
- item = self.items[0]
- self._set_selected_state(item, not item.selected)
-
- def set_single(self, selected, by_label=None):
- deprecation(
- "control.items[0].selected = <boolean>")
- if len(self.items) != 1:
- raise ItemCountError(
- "'%s' is not a single-item control" % self.name)
- self._set_selected_state(self.items[0], selected)
-
- def get_item_disabled(self, name, by_label=False, count=None):
- """Get disabled state of named list item in a ListControl."""
- deprecation(
- "control.get(...).disabled")
- return self.get(name, by_label, count).disabled
-
- def set_item_disabled(self, disabled, name, by_label=False, count=None):
- """Set disabled state of named list item in a ListControl.
-
- disabled: boolean disabled state
-
- """
- deprecation(
- "control.get(...).disabled = <boolean>")
- self.get(name, by_label, count).disabled = disabled
-
- def _set_item_disabled(self, item, disabled):
- if not self.multiple and item.selected and self.value:
- item.__dict__['_selected'] = False
- item.__dict__['_disabled'] = bool(disabled)
-
- def set_all_items_disabled(self, disabled):
- """Set disabled state of all list items in a ListControl.
-
- disabled: boolean disabled state
-
- """
- disabled = bool(disabled)
- if not self.multiple: # make sure that re-emerging items don't
- # make single-choice controls insane
- value = bool(self.value)
- for o in self.items:
- if not disabled and o.disabled:
- o.__dict__['_disabled'] = disabled
- if not self.multiple and o.selected:
- if value:
- o.selected = False
- else:
- value = True
- else:
- o.__dict__['_disabled'] = disabled
- else:
- for o in self.items:
- o.__dict__['_disabled'] = disabled
-
- def get_item_attrs(self, name, by_label=False, count=None):
- """Return dictionary of HTML attributes for a single ListControl item.
-
- The HTML element types that describe list items are: OPTION for SELECT
- controls, INPUT for the rest. These elements have HTML attributes that
- you may occasionally want to know about -- for example, the "alt" HTML
- attribute gives a text string describing the item (graphical browsers
- usually display this as a tooltip).
-
- The returned dictionary maps HTML attribute names to values. The names
- and values are taken from the original HTML.
- """
- deprecation(
- "control.get(...).attrs")
- return self.get(name, by_label, count).attrs
-
- def add_to_form(self, form):
- self._form = form
- try:
- control = form.find_control(self.name, self.type)
- except ControlNotFoundError:
- Control.add_to_form(self, form)
- else:
- control.merge_control(self)
-
- def merge_control(self, control):
- assert bool(control.multiple) == bool(self.multiple)
- assert isinstance(control, self.__class__)
- self.items.extend(control.items)
-
- def fixup(self):
- """
- ListControls are built up from component list items (which are also
- ListControls) during parsing. This method should be called after all
- items have been added. See ListControl.__doc__ for the reason this is
- required.
-
- """
- # Need to set default selection where no item was indicated as being
- # selected by the HTML:
-
- # CHECKBOX:
- # Nothing should be selected.
- # SELECT/single, SELECT/multiple and RADIO:
- # RFC 1866 (HTML 2.0): says first item should be selected.
- # W3C HTML 4.01 Specification: says that client behaviour is
- # undefined in this case. For RADIO, exactly one must be selected,
- # though which one is undefined.
- # Both Netscape and Microsoft Internet Explorer (IE) choose first
- # item for SELECT/single. However, both IE5 and Mozilla (both 1.0
- # and Firebird 0.6) leave all items unselected for RADIO and
- # SELECT/multiple.
-
- # Since both Netscape and IE all choose the first item for
- # SELECT/single, we do the same. OTOH, both Netscape and IE
- # leave SELECT/multiple with nothing selected, in violation of RFC 1866
- # (but not in violation of the W3C HTML 4 standard); the same is true
- # of RADIO (which *is* in violation of the HTML 4 standard). We follow
- # RFC 1866 if the _select_default attribute is set, and Netscape and IE
- # otherwise. RFC 1866 and HTML 4 are always violated insofar as you
- # can deselect all items in a RadioControl.
-
- for o in self.items:
- # set items' controls to self, now that we've merged
- o.__dict__['control'] = self
-
- def __getattr__(self, name):
- if name == "value":
- return [o.value for o in self.items if
- not o.disabled and o.selected]
- else:
- raise AttributeError("%s instance has no attribute '%s'" %
- (self.__class__.__name__, name))
-
- def __setattr__(self, name, value):
- if name == "value":
- if self.disabled:
- raise AttributeError("control '%s' is disabled" % self.name)
- if self.readonly:
- raise AttributeError("control '%s' is readonly" % self.name)
- self._set_value(value)
- elif name in ("name", "type", "multiple"):
- raise AttributeError("%s attribute is readonly" % name)
- else:
- self.__dict__[name] = value
-
- def _set_value(self, value):
- if value is None or isstringlike(value):
- raise TypeError("ListControl, must set a sequence")
- if not value:
- for o in self.items:
- if not o.disabled:
- o.selected = False
- elif self.multiple:
- self._multiple_set_value(value)
- elif len(value) > 1:
- raise ItemCountError(
- "single selection list, must set sequence of "
- "length 0 or 1")
- else:
- self._single_set_value(value)
-
- def _get_items(self, value, target=1):
- all_items = self.items_from_value(value)
- items = [o for o in all_items if not o.disabled]
- if len(items) < target:
- if len(all_items) < target:
- raise ItemNotFoundError(
- "insufficient items with value %r" % value)
- else:
- raise AttributeError('disabled item with value %s' % value)
- on = []
- off = []
- for o in items:
- if o.selected:
- on.append(o)
- else:
- off.append(o)
- return on, off
-
- def _single_set_value(self, value):
- on, off = self._get_items(value[0])
- if not on:
- off[0].selected = True
-
- def _multiple_set_value(self, value):
- turn_on = [] # transactional-ish
- turn_off = [o for o in self.items if o.selected and not o.disabled]
- values = {}
- for v in value:
- if v in values:
- values[v] += 1
- else:
- values[v] = 1
- for value, count in values.items():
- on, off = self._get_items(value, count)
- for i in range(count):
- if on:
- o = on[0]
- del on[0]
- del turn_off[turn_off.index(o)]
- else:
- o = off[0]
- del off[0]
- turn_on.append(o)
- for o in turn_off:
- o.selected = False
- for o in turn_on:
- o.selected = True
-
- def set_value_by_label(self, value):
- if isinstance(value, (str, unicode)):
- raise TypeError(value)
- items = []
- for v in value:
- found = self.items_from_label(v)
- if len(found) > 1:
- # ambiguous labels are fine as long as values are same
- opt_value = found[0].value
- if [o for o in found[1:] if o != opt_value]:
- raise AmbiguityError(v)
- for o in found: # for the multiple-item case, we could try to
- # be smarter, saving them up and trying to resolve, but that's
- # too much.
- if o not in items:
- items.append(o)
- break
- else: # all of them are used
- raise ItemNotFoundError(v)
- # now we have all the items that should be on
- # let's just turn everything off and then back on.
- self.value = []
- for o in items:
- o.selected = True
-
- def get_value_by_label(self):
- res = []
- for o in self.items:
- if not o.disabled and o.selected:
- for l in o.getLabels():
- if l.text:
- res.append(l.text)
- break
- else:
- res.append(None)
- return res
-
- def possible_items(self, by_label=False): # disabled are not possible
- deprecation(
- "[o.value for o in self.items]")
- if by_label:
- res = []
- for o in self.items:
- for l in o.getLabels():
- if l.text:
- res.append(l.text)
- break
- else:
- res.append(None)
- return res
- return [o.value for o in self.items]
-
- def pairs(self):
- if self.disabled:
- return []
- else:
- return [(self.name, o.value) for o in self.items
- if o.selected and not o.disabled]
-
- def __str__(self):
- name = self.name
- if name is None: name = "<None>"
-
- display = [str(o) for o in self.items]
-
- infos = []
- if self.disabled: infos.append("disabled")
- if self.readonly: infos.append("readonly")
- info = string.join(infos, ", ")
- if info: info = " (%s)" % info
-
- return "<%s(%s=[%s])%s>" % (self.__class__.__name__,
- name, string.join(display, ", "), info)
-
-
-class RadioControl(ListControl):
- """
- Covers:
-
- INPUT/RADIO
-
- """
- def __init__(self, type, name, attrs, select_default=False):
- attrs.setdefault('value', 'on')
- ListControl.__init__(self, type, name, attrs, select_default,
- called_as_base_class=True)
- self.__dict__["multiple"] = False
- o = Item(self, attrs)
- o.__dict__['_selected'] = attrs.has_key("checked")
- self.items.append(o)
-
- def fixup(self):
- ListControl.fixup(self)
- found = [o for o in self.items if o.selected and not o.disabled]
- if not found:
- if self._select_default:
- for o in self.items:
- if not o.disabled:
- o.selected = True
- break
- else: # eliminate any duplicate selected. Choose the last one.
- for o in found[:-1]:
- o.selected = False
-
- def getLabels(self):
- return []
-
-class CheckboxControl(ListControl):
- """
- Covers:
-
- INPUT/CHECKBOX
-
- """
- def __init__(self, type, name, attrs, select_default=False):
- attrs.setdefault('value', 'on')
- ListControl.__init__(self, type, name, attrs, select_default,
- called_as_base_class=True)
- self.__dict__["multiple"] = True
- o = Item(self, attrs)
- o.__dict__['_selected'] = attrs.has_key("checked")
- self.items.append(o)
-
- def getLabels(self):
- return []
-
-
-class SelectControl(ListControl):
- """
- Covers:
-
- SELECT (and OPTION)
-
- SELECT control values and labels are subject to some messy defaulting
- rules. For example, if the HTML representation of the control is:
-
- <SELECT name=year>
- <OPTION value=0 label="2002">current year</OPTION>
- <OPTION value=1>2001</OPTION>
- <OPTION>2000</OPTION>
- </SELECT>
-
- The items, in order, have labels "2002", "2001" and "2000", whereas their
- values are "0", "1" and "2000" respectively. Note that the value of the
- last OPTION in this example defaults to its contents, as specified by RFC
- 1866, as do the labels of the second and third OPTIONs.
-
- The OPTION labels are sometimes more meaningful than the OPTION values,
- which can make for more maintainable code.
-
- Additional read-only public attribute: attrs
-
- The attrs attribute is a dictionary of the original HTML attributes of the
- SELECT element. Other ListControls do not have this attribute, because in
- other cases the control as a whole does not correspond to any single HTML
- element. The get_item_attrs method may be used as usual to get at the
- HTML attributes of the HTML elements corresponding to individual list items
- (for SELECT controls, these are OPTION elements).
-
- Another special case is that the attributes dictionaries returned by
- get_item_attrs have a special key "contents" which does not correspond to
- any real HTML attribute, but rather contains the contents of the OPTION
- element:
-
- <OPTION>this bit</OPTION>
-
- """
- # HTML attributes here are treated slightly from other list controls:
- # -The SELECT HTML attributes dictionary is stuffed into the OPTION
- # HTML attributes dictionary under the "__select" key.
- # -The content of each OPTION element is stored under the special
- # "contents" key of the dictionary.
- # After all this, the dictionary is passed to the SelectControl constructor
- # as the attrs argument, as usual. However:
- # -The first SelectControl constructed when building up a SELECT control
- # has a constructor attrs argument containing only the __select key -- so
- # this SelectControl represents an empty SELECT control.
- # -Subsequent SelectControls have both OPTION HTML-attribute in attrs and
- # the __select dictionary containing the SELECT HTML-attributes.
-
- def __init__(self, type, name, attrs, select_default=False):
- # fish out the SELECT HTML attributes from the OPTION HTML attributes
- # dictionary
- self.attrs = attrs["__select"].copy()
- self.__dict__['_label'] = _getLabel(self.attrs)
- self.__dict__['id'] = self.attrs.get('id')
- self.__dict__["multiple"] = self.attrs.has_key("multiple")
- # the majority of the contents, label, and value dance already happened
- contents = attrs.get('contents')
- attrs = attrs.copy()
- del attrs["__select"]
-
- ListControl.__init__(self, type, name, self.attrs, select_default,
- called_as_base_class=True)
- self.disabled = self.attrs.has_key("disabled")
- self.readonly = self.attrs.has_key("readonly")
- if attrs.has_key('value'):
- # otherwise it is a marker 'select started' token
- o = Item(self, attrs)
- o.__dict__['_selected'] = attrs.has_key("selected")
- # add 'label' label and contents label, if different. If both are
- # provided, the 'label' label is used for display in HTML
- # 4.0-compliant browsers (and any lower spec? not sure) while the
- # contents are used for display in older or less-compliant
- # browsers. We make label objects for both, if the values are
- # different.
- label = attrs.get('label')
- if label:
- o._labels.append(Label({'__text': label}))
- if contents and contents != label:
- o._labels.append(Label({'__text': contents}))
- elif contents:
- o._labels.append(Label({'__text': contents}))
- self.items.append(o)
-
- def fixup(self):
- ListControl.fixup(self)
- found = [o for o in self.items if o.selected and not o.disabled]
- if not found:
- if not self.multiple or self._select_default:
- for o in self.items:
- if not o.disabled:
- o.selected = True
- break
- elif not self.multiple: # eliminate any duplicate selected.
- # Choose the last one.
- for o in found[:-1]:
- o.selected = False
-
-
-#---------------------------------------------------
-class SubmitControl(ScalarControl):
- """
- Covers:
-
- INPUT/SUBMIT
- BUTTON/SUBMIT
-
- """
- def __init__(self, type, name, attrs):
- ScalarControl.__init__(self, type, name, attrs)
- # IE5 defaults SUBMIT value to "Submit Query"; Firebird 0.6 leaves it
- # blank, Konqueror 3.1 defaults to "Submit". HTML spec. doesn't seem
- # to define this.
- if self.value is None: self.value = ""
- self.readonly = True
-
- def getLabels(self):
- res = []
- if self.value:
- res.append(Label({'__text': self.value}))
- res.extend(ScalarControl.getLabels(self))
- return res
-
- def is_of_kind(self, kind): return kind == "clickable"
-
- def _click(self, form, coord, return_type, request_class=urllib2.Request):
- self._clicked = coord
- r = form._switch_click(return_type, request_class)
- self._clicked = False
- return r
-
- def pairs(self):
- if not self._clicked:
- return []
- return ScalarControl.pairs(self)
-
-
-#---------------------------------------------------
-class ImageControl(SubmitControl):
- """
- Covers:
-
- INPUT/IMAGE
-
- Coordinates are specified using one of the HTMLForm.click* methods.
-
- """
- def __init__(self, type, name, attrs):
- SubmitControl.__init__(self, type, name, attrs)
- self.readonly = False
-
- def pairs(self):
- clicked = self._clicked
- if self.disabled or not clicked:
- return []
- name = self.name
- if name is None: return []
- pairs = [
- ("%s.x" % name, str(clicked[0])),
- ("%s.y" % name, str(clicked[1])),
- ]
- value = self._value
- if value:
- pairs.append((name, value))
- return pairs
-
- getLabels = ScalarControl.getLabels
-
-# aliases, just to make str(control) and str(form) clearer
-class PasswordControl(TextControl): pass
-class HiddenControl(TextControl): pass
-class TextareaControl(TextControl): pass
-class SubmitButtonControl(SubmitControl): pass
-
-
-def is_listcontrol(control): return control.is_of_kind("list")
-
-
-class HTMLForm:
- """Represents a single HTML <form> ... </form> element.
-
- A form consists of a sequence of controls that usually have names, and
- which can take on various values. The values of the various types of
- controls represent variously: text, zero-or-one-of-many or many-of-many
- choices, and files to be uploaded. Some controls can be clicked on to
- submit the form, and clickable controls' values sometimes include the
- coordinates of the click.
-
- Forms can be filled in with data to be returned to the server, and then
- submitted, using the click method to generate a request object suitable for
- passing to urllib2.urlopen (or the click_request_data or click_pairs
- methods if you're not using urllib2).
-
- import ClientForm
- forms = ClientForm.ParseFile(html, base_uri)
- form = forms[0]
-
- form["query"] = "Python"
- form.set("lots", "nr_results")
-
- response = urllib2.urlopen(form.click())
-
- Usually, HTMLForm instances are not created directly. Instead, the
- ParseFile or ParseResponse factory functions are used. If you do construct
- HTMLForm objects yourself, however, note that an HTMLForm instance is only
- properly initialised after the fixup method has been called (ParseFile and
- ParseResponse do this for you). See ListControl.__doc__ for the reason
- this is required.
-
- Indexing a form (form["control_name"]) returns the named Control's value
- attribute. Assignment to a form index (form["control_name"] = something)
- is equivalent to assignment to the named Control's value attribute. If you
- need to be more specific than just supplying the control's name, use the
- set_value and get_value methods.
-
- ListControl values are lists of item names. The list item's name is the
- value of the corresponding HTML element's "value" attribute.
-
- Example:
-
- <INPUT type="CHECKBOX" name="cheeses" value="leicester"></INPUT>
- <INPUT type="CHECKBOX" name="cheeses" value="cheddar"></INPUT>
-
- defines a CHECKBOX control with name "cheeses" which has two items, named
- "leicester" and "cheddar".
-
- Another example:
-
- <SELECT name="more_cheeses">
- <OPTION>1</OPTION>
- <OPTION value="2" label="CHEDDAR">cheddar</OPTION>
- </SELECT>
-
- defines a SELECT control with name "more_cheeses" which has two items,
- named "1" and "2" (because the OPTION element's value HTML attribute
- defaults to the element contents).
-
- To set, clear or toggle individual list items, use the set and toggle
- methods. To set the whole value, do as for any other control:use indexing
- or the set_/get_value methods.
-
- Example:
-
- # select *only* the item named "cheddar"
- form["cheeses"] = ["cheddar"]
- # select "cheddar", leave other items unaffected
- form.set("cheddar", "cheeses")
-
- Some controls (RADIO and SELECT without the multiple attribute) can only
- have zero or one items selected at a time. Some controls (CHECKBOX and
- SELECT with the multiple attribute) can have multiple items selected at a
- time. To set the whole value of a ListControl, assign a sequence to a form
- index:
-
- form["cheeses"] = ["cheddar", "leicester"]
-
- If the ListControl is not multiple-selection, the assigned list must be of
- length one.
-
- To check whether a control has an item, or whether an item is selected,
- respectively:
-
- "cheddar" in form.possible_items("cheeses")
- "cheddar" in form["cheeses"] # (or "cheddar" in form.get_value("cheeses"))
-
- Note that some list items may be disabled (see below).
-
- Note the following mistake:
-
- form[control_name] = control_value
- assert form[control_name] == control_value # not necessarily true
-
- The reason for this is that form[control_name] always gives the list items
- in the order they were listed in the HTML.
-
- List items (hence list values, too) can be referred to in terms of list
- item labels rather than list item names. Currently, this is only possible
- for SELECT controls (this is a bug). To use this feature, use the by_label
- arguments to the various HTMLForm methods. Note that it is *item* names
- (hence ListControl values also), not *control* names, that can be referred
- to by label.
-
- The question of default values of OPTION contents, labels and values is
- somewhat complicated: see SelectControl.__doc__ and
- ListControl.get_item_attrs.__doc__ if you think you need to know.
-
- Controls can be disabled or readonly. In either case, the control's value
- cannot be changed until you clear those flags (see example below).
- Disabled is the state typically represented by browsers by `greying out' a
- control. Disabled controls are not `successful' -- they don't cause data
- to get returned to the server. Readonly controls usually appear in
- browsers as read-only text boxes. Readonly controls are successful. List
- items can also be disabled. Attempts to select disabled items (with
- form[name] = value, or using the ListControl.set method, for example) fail.
- Attempts to clear disabled items are allowed.
-
- If a lot of controls are readonly, it can be useful to do this:
-
- form.set_all_readonly(False)
-
- To clear a control's value attribute, so that it is not successful (until a
- value is subsequently set):
-
- form.clear("cheeses")
-
- When you want to do several things with a single control, or want to do
- less common things, like changing which controls and items are disabled,
- you can get at a particular control:
-
- control = form.find_control("cheeses")
- control.disabled = False
- control.readonly = False
- control.set_item_disabled(False, "gruyere")
- control.set("gruyere")
-
- Most methods on HTMLForm just delegate to the contained controls, so see
- the docstrings of the various Control classes for further documentation.
- Most of these delegating methods take name, type, kind, id and nr arguments
- to specify the control to be operated on: see
- HTMLForm.find_control.__doc__.
-
- ControlNotFoundError (subclass of ValueError) is raised if the specified
- control can't be found. This includes occasions where a non-ListControl
- is found, but the method (set, for example) requires a ListControl.
- ItemNotFoundError (subclass of ValueError) is raised if a list item can't
- be found. ItemCountError (subclass of ValueError) is raised if an attempt
- is made to select more than one item and the control doesn't allow that, or
- set/get_single are called and the control contains more than one item.
- AttributeError is raised if a control or item is readonly or disabled and
- an attempt is made to alter its value.
-
- Security note: Remember that any passwords you store in HTMLForm instances
- will be saved to disk in the clear if you pickle them (directly or
- indirectly). The simplest solution to this is to avoid pickling HTMLForm
- objects. You could also pickle before filling in any password, or just set
- the password to "" before pickling.
-
-
- Public attributes:
-
- action: full (absolute URI) form action
- method: "GET" or "POST"
- enctype: form transfer encoding MIME type
- name: name of form (None if no name was specified)
- attrs: dictionary mapping original HTML form attributes to their values
-
- controls: list of Control instances; do not alter this list
- (instead, call form.new_control to make a Control and add it to the
- form, or control.add_to_form if you already have a Control instance)
-
-
-
- Methods for form filling:
- -------------------------
-
- Most of the these methods have very similar arguments. See
- HTMLForm.find_control.__doc__ for details of the name, type, kind and nr
- arguments. See above for a description of by_label.
-
- def find_control(self,
- name=None, type=None, kind=None, id=None, predicate=None,
- nr=None)
-
- get_value(name=None, type=None, kind=None, id=None, nr=None,
- by_label=False)
- set_value(value,
- name=None, type=None, kind=None, id=None, nr=None,
- by_label=False)
-
- set_all_readonly(readonly)
-
-
- Methods applying only to ListControls:
-
- possible_items(name=None, type=None, kind=None, id=None, nr=None,
- by_label=False)
-
- set(selected, item_name,
- name=None, type=None, kind=None, id=None, nr=None,
- by_label=False)
- toggle(item_name,
- name=None, type=None, id=None, nr=None,
- by_label=False)
-
- set_single(selected,
- name=None, type=None, kind=None, id=None, nr=None,
- by_label=False)
- toggle_single(name=None, type=None, kind=None, id=None, nr=None,
- by_label=False)
-
-
- Method applying only to FileControls:
-
- add_file(file_object,
- content_type="application/octet-stream", filename=None,
- name=None, id=None, nr=None)
-
-
- Methods applying only to clickable controls:
-
- click(name=None, type=None, id=None, nr=0, coord=(1,1))
- click_request_data(name=None, type=None, id=None, nr=0, coord=(1,1))
- click_pairs(name=None, type=None, id=None, nr=0, coord=(1,1))
-
- """
-
- type2class = {
- "text": TextControl,
- "password": PasswordControl,
- "hidden": HiddenControl,
- "textarea": TextareaControl,
-
- "isindex": IsindexControl,
-
- "file": FileControl,
-
- "button": IgnoreControl,
- "buttonbutton": IgnoreControl,
- "reset": IgnoreControl,
- "resetbutton": IgnoreControl,
-
- "submit": SubmitControl,
- "submitbutton": SubmitButtonControl,
- "image": ImageControl,
-
- "radio": RadioControl,
- "checkbox": CheckboxControl,
- "select": SelectControl,
- }
-
-#---------------------------------------------------
-# Initialisation. Use ParseResponse / ParseFile instead.
-
- def __init__(self, action, method="GET",
- enctype="application/x-www-form-urlencoded",
- name=None, attrs=None,
- request_class=urllib2.Request,
- forms=None, labels=None, id_to_labels=None):
- """
- In the usual case, use ParseResponse (or ParseFile) to create new
- HTMLForm objects.
-
- action: full (absolute URI) form action
- method: "GET" or "POST"
- enctype: form transfer encoding MIME type
- name: name of form
- attrs: dictionary mapping original HTML form attributes to their values
-
- """
- self.action = action
- self.method = method
- self.enctype = enctype
- self.name = name
- if attrs is not None:
- self.attrs = attrs.copy()
- else:
- self.attrs = {}
- self.controls = []
- self._request_class = request_class
- self._forms = forms # this is a semi-public API!
- self._labels = labels # this is a semi-public API!
- self._id_to_labels = id_to_labels # this is a semi-public API!
-
- def new_control(self, type, name, attrs,
- ignore_unknown=False, select_default=False):
- """Adds a new control to the form.
-
- This is usually called by ParseFile and ParseResponse. Don't call it
- youself unless you're building your own Control instances.
-
- Note that controls representing lists of items are built up from
- controls holding only a single list item. See ListControl.__doc__ for
- further information.
-
- type: type of control (see Control.__doc__ for a list)
- attrs: HTML attributes of control
- ignore_unknown: if true, use a dummy Control instance for controls of
- unknown type; otherwise, use a TextControl
- select_default: for RADIO and multiple-selection SELECT controls, pick
- the first item as the default if no 'selected' HTML attribute is
- present (this defaulting happens when the HTMLForm.fixup method is
- called)
-
- """
- type = string.lower(type)
- klass = self.type2class.get(type)
- if klass is None:
- if ignore_unknown:
- klass = IgnoreControl
- else:
- klass = TextControl
-
- a = attrs.copy()
- if issubclass(klass, ListControl):
- control = klass(type, name, a, select_default)
- else:
- control = klass(type, name, a)
- control.add_to_form(self)
-
- def fixup(self):
- """Normalise form after all controls have been added.
-
- This is usually called by ParseFile and ParseResponse. Don't call it
- youself unless you're building your own Control instances.
-
- This method should only be called once, after all controls have been
- added to the form.
-
- """
- for control in self.controls:
- control.fixup()
-
-#---------------------------------------------------
- def __str__(self):
- header = "%s %s %s" % (self.method, self.action, self.enctype)
- rep = [header]
- for control in self.controls:
- rep.append(" %s" % str(control))
- return "<%s>" % string.join(rep, "\n")
-
-#---------------------------------------------------
-# Form-filling methods.
-
- def __getitem__(self, name):
- return self.find_control(name).value
- def __setitem__(self, name, value):
- control = self.find_control(name)
- try:
- control.value = value
- except AttributeError, e:
- raise ValueError(str(e))
-
- def get_value(self,
- name=None, type=None, kind=None, id=None, nr=None,
- by_label=False):
- """Return value of control.
-
- If only name and value arguments are supplied, equivalent to
-
- form[name]
-
- """
- c = self.find_control(name, type, kind, id, nr=nr)
- if by_label:
- try:
- meth = c.get_value_by_label
- except AttributeError:
- raise NotImplementedError(
- "control '%s' does not yet support by_label" % c.name)
- else:
- return meth()
- else:
- return c.value
- def set_value(self, value,
- name=None, type=None, kind=None, id=None, nr=None,
- by_label=False):
- """Set value of control.
-
- If only name and value arguments are supplied, equivalent to
-
- form[name] = value
-
- """
- c = self.find_control(name, type, kind, id, nr=nr)
- if by_label:
- try:
- meth = c.set_value_by_label
- except AttributeError:
- raise NotImplementedError(
- "control '%s' does not yet support by_label" % c.name)
- else:
- meth(value)
- else:
- c.value = value
-
- def set_all_readonly(self, readonly):
- for control in self.controls:
- control.readonly = bool(readonly)
-
- def clear_all(self):
- """Clear the value attributes of all controls in the form.
-
- See HTMLForm.clear.__doc__.
-
- """
- for control in self.controls:
- control.clear()
-
- def clear(self,
- name=None, type=None, kind=None, id=None, nr=None):
- """Clear the value attributes of all controls in the form.
-
- As a result, the affected controls will not be successful until a value
- is subsequently set. AttributeError is raised on readonly controls.
-
- """
- c = self.find_control(name, type, kind, id, nr=nr)
- c.clear()
-
-
-#---------------------------------------------------
-# Form-filling methods applying only to ListControls.
-
- def possible_items(self, # deprecated
- name=None, type=None, kind=None, id=None, label=None,
- nr=None, by_label=False):
- """Return a list of all values that the specified control can take."""
- c = self._find_list_control(name, type, kind, id, label, nr)
- return c.possible_items(by_label)
-
- def set(self, selected, item_name, # deprecated
- name=None, type=None, kind=None, id=None, label=None, nr=None,
- by_label=False):
- """Select / deselect named list item.
-
- selected: boolean selected state
-
- """
- self._find_list_control(name, type, kind, id, label, nr).set(
- selected, item_name, by_label)
- def toggle(self, item_name, # deprecated
- name=None, type=None, kind=None, id=None, label=None, nr=None,
- by_label=False):
- """Toggle selected state of named list item."""
- self._find_list_control(name, type, kind, id, label, nr).toggle(
- item_name, by_label)
-
- def set_single(self, selected, # deprecated
- name=None, type=None, kind=None, id=None, label=None,
- nr=None, by_label=None):
- """Select / deselect list item in a control having only one item.
-
- If the control has multiple list items, ItemCountError is raised.
-
- This is just a convenience method, so you don't need to know the item's
- name -- the item name in these single-item controls is usually
- something meaningless like "1" or "on".
-
- For example, if a checkbox has a single item named "on", the following
- two calls are equivalent:
-
- control.toggle("on")
- control.toggle_single()
-
- """ # by_label ignored and deprecated
- self._find_list_control(
- name, type, kind, id, label, nr).set_single(selected)
- def toggle_single(self, name=None, type=None, kind=None, id=None,
- label=None, nr=None, by_label=None): # deprecated
- """Toggle selected state of list item in control having only one item.
-
- The rest is as for HTMLForm.set_single.__doc__.
-
- """ # by_label ignored and deprecated
- self._find_list_control(name, type, kind, id, label, nr).toggle_single()
-
-#---------------------------------------------------
-# Form-filling method applying only to FileControls.
-
- def add_file(self, file_object, content_type=None, filename=None,
- name=None, id=None, label=None, nr=None):
- """Add a file to be uploaded.
-
- file_object: file-like object (with read method) from which to read
- data to upload
- content_type: MIME content type of data to upload
- filename: filename to pass to server
-
- If filename is None, no filename is sent to the server.
-
- If content_type is None, the content type is guessed based on the
- filename and the data from read from the file object.
-
- XXX
- At the moment, guessed content type is always application/octet-stream.
- Use sndhdr, imghdr modules. Should also try to guess HTML, XML, and
- plain text.
-
- Note the following useful HTML attributes of file upload controls (see
- HTML 4.01 spec, section 17):
-
- accept: comma-separated list of content types that the server will
- handle correctly; you can use this to filter out non-conforming files
- size: XXX IIRC, this is indicative of whether form wants multiple or
- single files
- maxlength: XXX hint of max content length in bytes?
-
- """
- self.find_control(name, "file", id=id, label=label, nr=nr).add_file(
- file_object, content_type, filename)
-
-#---------------------------------------------------
-# Form submission methods, applying only to clickable controls.
-
- def click(self, name=None, type=None, id=None, label=None, nr=0, coord=(1,1),
- request_class=urllib2.Request):
- """Return request that would result from clicking on a control.
-
- The request object is a urllib2.Request instance, which you can pass to
- urllib2.urlopen (or ClientCookie.urlopen).
-
- Only some control types (INPUT/SUBMIT & BUTTON/SUBMIT buttons and
- IMAGEs) can be clicked.
-
- Will click on the first clickable control, subject to the name, type
- and nr arguments (as for find_control). If no name, type, id or number
- is specified and there are no clickable controls, a request will be
- returned for the form in its current, un-clicked, state.
-
- IndexError is raised if any of name, type, id or nr is specified but no
- matching control is found. ValueError is raised if the HTMLForm has an
- enctype attribute that is not recognised.
-
- You can optionally specify a coordinate to click at, which only makes a
- difference if you clicked on an image.
-
- """
- return self._click(name, type, id, label, nr, coord, "request",
- self._request_class)
-
- def click_request_data(self,
- name=None, type=None, id=None, label=None,
- nr=0, coord=(1,1),
- request_class=urllib2.Request):
- """As for click method, but return a tuple (url, data, headers).
-
- You can use this data to send a request to the server. This is useful
- if you're using httplib or urllib rather than urllib2. Otherwise, use
- the click method.
-
- # Untested. Have to subclass to add headers, I think -- so use urllib2
- # instead!
- import urllib
- url, data, hdrs = form.click_request_data()
- r = urllib.urlopen(url, data)
-
- # Untested. I don't know of any reason to use httplib -- you can get
- # just as much control with urllib2.
- import httplib, urlparse
- url, data, hdrs = form.click_request_data()
- tup = urlparse(url)
- host, path = tup[1], urlparse.urlunparse((None, None)+tup[2:])
- conn = httplib.HTTPConnection(host)
- if data:
- httplib.request("POST", path, data, hdrs)
- else:
- httplib.request("GET", path, headers=hdrs)
- r = conn.getresponse()
-
- """
- return self._click(name, type, id, label, nr, coord, "request_data",
- self._request_class)
-
- def click_pairs(self, name=None, type=None, id=None, label=None,
- nr=0, coord=(1,1)):
- """As for click_request_data, but returns a list of (key, value) pairs.
-
- You can use this list as an argument to ClientForm.urlencode. This is
- usually only useful if you're using httplib or urllib rather than
- urllib2 or ClientCookie. It may also be useful if you want to manually
- tweak the keys and/or values, but this should not be necessary.
- Otherwise, use the click method.
-
- Note that this method is only useful for forms of MIME type
- x-www-form-urlencoded. In particular, it does not return the
- information required for file upload. If you need file upload and are
- not using urllib2, use click_request_data.
-
- Also note that Python 2.0's urllib.urlencode is slightly broken: it
- only accepts a mapping, not a sequence of pairs, as an argument. This
- messes up any ordering in the argument. Use ClientForm.urlencode
- instead.
-
- """
- return self._click(name, type, id, label, nr, coord, "pairs",
- self._request_class)
-
-#---------------------------------------------------
-
- def find_control(self,
- name=None, type=None, kind=None, id=None, label=None,
- predicate=None, nr=None):
- """Locate and return some specific control within the form.
-
- At least one of the name, type, kind, predicate and nr arguments must
- be supplied. If no matching control is found, ControlNotFoundError is
- raised.
-
- If name is specified, then the control must have the indicated name.
-
- If type is specified then the control must have the specified type (in
- addition to the types possible for <input> HTML tags: "text",
- "password", "hidden", "submit", "image", "button", "radio", "checkbox",
- "file" we also have "reset", "buttonbutton", "submitbutton",
- "resetbutton", "textarea", "select" and "isindex").
-
- If kind is specified, then the control must fall into the specified
- group, each of which satisfies a particular interface. The types are
- "text", "list", "multilist", "singlelist", "clickable" and "file".
-
- If id is specified, then the control must have the indicated id.
-
- If predicate is specified, then the control must match that function.
- The predicate function is passed the control as its single argument,
- and should return a boolean value indicating whether the control
- matched.
-
- nr, if supplied, is the sequence number of the control (where 0 is the
- first). Note that control 0 is the first control matching all the
- other arguments (if supplied); it is not necessarily the first control
- in the form.
-
- """
- if ((name is None) and (type is None) and (kind is None) and
- (id is None) and (label is None) and (predicate is None) and
- (nr is None)):
- raise ValueError(
- "at least one argument must be supplied to specify control")
- if nr is None: nr = 0
-
- return self._find_control(name, type, kind, id, label, predicate, nr)
-
-#---------------------------------------------------
-# Private methods.
-
- def _find_list_control(self,
- name=None, type=None, kind=None, id=None,
- label=None, nr=None):
- if ((name is None) and (type is None) and (kind is None) and
- (id is None) and (label is None) and (nr is None)):
- raise ValueError(
- "at least one argument must be supplied to specify control")
- if nr is None: nr = 0
-
- return self._find_control(name, type, kind, id, label,
- is_listcontrol, nr)
-
- def _find_control(self, name, type, kind, id, label, predicate, nr):
- if (name is not None) and not isstringlike(name):
- raise TypeError("control name must be string-like")
- if (type is not None) and not isstringlike(type):
- raise TypeError("control type must be string-like")
- if (kind is not None) and not isstringlike(kind):
- raise TypeError("control kind must be string-like")
- if (id is not None) and not isstringlike(id):
- raise TypeError("control id must be string-like")
- if (label is not None) and not isstringlike(label):
- raise TypeError("control label must be string-like")
- if (predicate is not None) and not callable(predicate):
- raise TypeError("control predicate must be callable")
- if nr < 0: raise ValueError("control number must be a positive "
- "integer")
-
- orig_nr = nr
-
- for control in self.controls:
- if name is not None and name != control.name:
- continue
- if type is not None and type != control.type:
- continue
- if kind is not None and not control.is_of_kind(kind):
- continue
- if id is not None and id != control.id:
- continue
- if predicate and not predicate(control):
- continue
- if label:
- for l in control.getLabels():
- if l.text.find(label) > -1:
- break
- else:
- continue
- if nr:
- nr = nr - 1
- continue
- return control
-
- description = []
- if name is not None: description.append("name '%s'" % name)
- if type is not None: description.append("type '%s'" % type)
- if kind is not None: description.append("kind '%s'" % kind)
- if id is not None: description.append("id '%s'" % id)
- if label is not None: description.append("label '%s'" % label)
- if predicate is not None:
- description.append("predicate %s" % predicate)
- if orig_nr: description.append("nr %d" % orig_nr)
- description = string.join(description, ", ")
- raise ControlNotFoundError("no control matching "+description)
-
- def _click(self, name, type, id, label, nr, coord, return_type,
- request_class=urllib2.Request):
- try:
- control = self._find_control(
- name, type, "clickable", id, label, None, nr)
- except ControlNotFoundError:
- if ((name is not None) or (type is not None) or (id is not None) or
- (nr != 0)):
- raise
- # no clickable controls, but no control was explicitly requested,
- # so return state without clicking any control
- return self._switch_click(return_type, request_class)
- else:
- return control._click(self, coord, return_type, request_class)
-
- def _pairs(self):
- """Return sequence of (key, value) pairs suitable for urlencoding."""
- pairs = []
- for control in self.controls:
- pairs.extend(control.pairs())
- return pairs
-
- def _request_data(self):
- """Return a tuple (url, data, headers)."""
- method = string.upper(self.method)
- #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(self.action)
- parts = urlparse.urlparse(self.action)
- rest, (query, frag) = parts[:-2], parts[-2:]
-
- if method == "GET":
- if self.enctype != "application/x-www-form-urlencoded":
- raise ValueError(
- "unknown GET form encoding type '%s'" % self.enctype)
- parts = rest + (urlencode(self._pairs()), "")
- uri = urlparse.urlunparse(parts)
- return uri, None, []
- elif method == "POST":
- parts = rest + (query, "")
- uri = urlparse.urlunparse(parts)
- if self.enctype == "application/x-www-form-urlencoded":
- return (uri, urlencode(self._pairs()),
- [("Content-type", self.enctype)])
- elif self.enctype == "multipart/form-data":
- data = StringIO()
- http_hdrs = []
- mw = MimeWriter(data, http_hdrs)
- f = mw.startmultipartbody("form-data", add_to_http_hdrs=True,
- prefix=0)
- for control in self.controls:
- control._write_mime_data(mw)
- mw.lastpart()
- return uri, data.getvalue(), http_hdrs
- else:
- raise ValueError(
- "unknown POST form encoding type '%s'" % self.enctype)
- else:
- raise ValueError("Unknown method '%s'" % method)
-
- def _switch_click(self, return_type, request_class=urllib2.Request):
- # This is called by HTMLForm and clickable Controls to hide switching
- # on return_type.
- if return_type == "pairs":
- return self._pairs()
- elif return_type == "request_data":
- return self._request_data()
- else:
- req_data = self._request_data()
- req = request_class(req_data[0], req_data[1])
- for key, val in req_data[2]:
- req.add_header(key, val)
- return req
Modified: Zope3/trunk/src/zope/testbrowser/__init__.py
===================================================================
--- Zope3/trunk/src/zope/testbrowser/__init__.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/zope/testbrowser/__init__.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -16,51 +16,4 @@
$Id$
"""
-# TODO this should be removed once John J. Lee releases the new versions of
-# ClientForm and pullparser that have the code we rely on here. At that point
-# we should also remove ClientForm.py from this directory.
-import sys
-
-# stitch in ClientCookie
-from zope.testbrowser import ClientCookie
-
-if 'ClientCookie' not in sys.modules:
- sys.modules['ClientCookie'] = ClientCookie
-else:
- assert sys.modules['ClientCookie'] is ClientCookie
-import ClientCookie as x
-assert x is ClientCookie
-
-# stitch in ClientForm
-from zope.testbrowser import ClientForm
-
-if 'ClientForm' not in sys.modules:
- sys.modules['ClientForm'] = ClientForm
-else:
- assert sys.modules['ClientForm'] is ClientForm
-import ClientForm as x
-assert x is ClientForm
-
-# stitch in pullparser
-from zope.testbrowser import pullparser
-
-if 'pullparser' not in sys.modules:
- sys.modules['pullparser'] = pullparser
-else:
- assert sys.modules['pullparser'] is pullparser
-import pullparser as x
-assert x is pullparser
-# end TODO
-
-# stitch in _mechanize
-from zope.testbrowser import mechanize
-
-if 'mechanize' not in sys.modules:
- sys.modules['mechanize'] = mechanize
-else:
- assert sys.modules['mechanize'] is mechanize
-import mechanize as x
-assert x is mechanize
-# end TODO
-
from testing import Browser
Modified: Zope3/trunk/src/zope/testbrowser/browser.py
===================================================================
--- Zope3/trunk/src/zope/testbrowser/browser.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/zope/testbrowser/browser.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -234,7 +234,18 @@
"""See zope.testbrowser.interfaces.IBrowser"""
self._start_timer()
self.mech_browser.back(count)
+ # we want to ignore history of 302 redirects. If we go back too far,
+ # mechanize will raise a BrowserStateError as usual
+ while self.mech_browser.response() is None:
+ self.mech_browser.back()
self._stop_timer()
+ # TODO this is a hack to get around a bug in mechanize
+ response = self.mech_browser.response()
+ if response is not None:
+ response.wrapped.url = response.url
+ response.wrapped.headers = response.headers
+ response.close = lambda: None
+ # end hack
self._changed()
def addHeader(self, key, value):
@@ -276,12 +287,12 @@
phantom or control.type=='select'):
for i in control.items:
- for l in i.getLabels():
+ for l in i.get_labels():
if matches(l.text):
found.append((i, f))
break
if not phantom:
- for l in control.getLabels():
+ for l in control.get_labels():
if matches(l.text):
found.append((control, f))
break
@@ -421,7 +432,7 @@
def fget(self):
if (self.type == 'checkbox' and
len(self.mech_control.items) == 1 and
- self.mech_control.items[0].value == 'on'):
+ self.mech_control.items[0].name == 'on'):
return self.mech_control.items[0].selected
return self.mech_control.value
@@ -472,21 +483,21 @@
res = []
for item in self.mech_control.items:
if not item.disabled:
- for label in item.getLabels():
+ for label in item.get_labels():
if label.text:
res.append(label.text)
break
- else:
- res.append(None)
+ else:
+ res.append(None)
return res
@property
def options(self):
"""See zope.testbrowser.interfaces.IListControl"""
if (self.type == 'checkbox' and len(self.mech_control.items) == 1 and
- self.mech_control.items[0].value == 'on'):
+ self.mech_control.items[0].name == 'on'):
return [True]
- return [i.value for i in self.mech_control.items if not i.disabled]
+ return [i.name for i in self.mech_control.items if not i.disabled]
@property
def controls(self):
@@ -505,10 +516,10 @@
onlyOne([label, value], '"label" and "value"')
if label is not None:
- options = self.mech_control.items_from_label(label)
+ options = self.mech_control.get_items(label=label)
msg = 'label %r' % label
elif value is not None:
- options = self.mech_control.items_from_value(value)
+ options = self.mech_control.get_items(name=value)
msg = 'value %r' % value
res = controlFactory(
disambiguate(options, msg, index), self.mech_form, self.browser)
@@ -551,7 +562,7 @@
if self._browser_counter != self.browser._counter:
raise interfaces.ExpiredError
res = controlFactory(
- self.mech_item.control, self.mech_form, self.browser)
+ self.mech_item._control, self.mech_form, self.browser)
self.__dict__['control'] = res
return res
@@ -584,8 +595,8 @@
def __repr__(self):
return "<%s name=%r type=%r optionValue=%r>" % (
- self.__class__.__name__, self.mech_item.control.name,
- self.mech_item.control.type, self.optionValue)
+ self.__class__.__name__, self.mech_item._control.name,
+ self.mech_item._control.type, self.optionValue)
class Form(SetattrErrorsMixin):
Deleted: Zope3/trunk/src/zope/testbrowser/pullparser.py
===================================================================
--- Zope3/trunk/src/zope/testbrowser/pullparser.py 2005-11-01 16:29:00 UTC (rev 39817)
+++ Zope3/trunk/src/zope/testbrowser/pullparser.py 2005-11-01 17:34:14 UTC (rev 39818)
@@ -1,350 +0,0 @@
-"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
-
-Examples
-
-This program extracts all links from a document. It will print one
-line for each link, containing the URL and the textual description
-between the <A>...</A> tags:
-
-import pullparser, sys
-f = file(sys.argv[1])
-p = pullparser.PullParser(f)
-for token in p.tags("a"):
- if token.type == "endtag": continue
- url = dict(token.attrs).get("href", "-")
- text = p.get_compressed_text(endat=("endtag", "a"))
- print "%s\t%s" % (url, text)
-
-This program extracts the <TITLE> from the document:
-
-import pullparser, sys
-f = file(sys.argv[1])
-p = pullparser.PullParser(f)
-if p.get_tag("title"):
- title = p.get_compressed_text()
- print "Title: %s" % title
-
-
-Copyright 2003-2004 John J. Lee <jjl at pobox.com>
-Copyright 1998-2001 Gisle Aas (original libwww-perl code)
-
-This code is free software; you can redistribute it and/or modify it
-under the terms of the BSD License.
-
-"""
-
-from __future__ import generators
-
-import re, htmlentitydefs
-import HTMLParser
-
-__version__ = (0, 0, 6, None, None) # 0.0.6b
-
-class NoMoreTokensError(Exception): pass
-
-class Token:
- """Represents an HTML tag, declaration, processing instruction etc.
-
- Behaves as both a tuple-like object (ie. iterable) and has attributes
- .type, .data and .attrs.
-
- >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
- >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
- True
- >>> t.type, t.data == "starttag", "a"
- True
- >>> t.attrs == [("href", "http://www.python.org/")]
- True
-
- Public attributes
-
- type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
- "data", "comment", "decl", "pi", after the corresponding methods of
- HTMLParser.HTMLParser
- data: For a tag, the tag name; otherwise, the relevant data carried by the
- tag, as a string
- attrs: list of (name, value) pairs representing HTML attributes
- (or None if token does not represent an opening tag)
-
- """
- def __init__(self, type, data, attrs=None):
- self.type = type
- self.data = data
- self.attrs = attrs
- def __iter__(self):
- return iter((self.type, self.data, self.attrs))
- def __eq__(self, other):
- type, data, attrs = other
- if (self.type == type and
- self.data == data and
- self.attrs == attrs):
- return True
- else:
- return False
- def __ne__(self, other): return not self.__eq__(other)
- def __repr__(self):
- args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
- return self.__class__.__name__+"(%s)" % args
-
-def iter_until_exception(fn, exception, *args, **kwds):
- while 1:
- try:
- yield fn(*args, **kwds)
- except exception:
- raise StopIteration
-
-def caller():
- try:
- raise SyntaxError
- except:
- import sys
- return sys.exc_traceback.tb_frame.f_back.f_back.f_code.co_name
-
-def unescape(data, entities):
- if data is None or '&' not in data:
- return data
-
- def replace_entities(match):
- ent = match.group()
- repl = entities.get(ent, ent)
- return repl
-
- return re.sub(r'&\S+;', replace_entities, data)
-
-def get_entitydefs():
- entitydefs = {}
- for name, char in htmlentitydefs.entitydefs.items():
- entitydefs["&%s;" % name] = char
- return entitydefs
-
-
-class _AbstractParser:
- chunk = 1024
- compress_re = re.compile(r"\s+")
- entitydefs = htmlentitydefs.entitydefs
- def __init__(self, fh, textify={"img": "alt", "applet": "alt"},
- encoding="ascii", entitydefs=None):
- """
- fh: file-like object (only a .read() method is required) from which to
- read HTML to be parsed
- textify: mapping used by .get_text() and .get_compressed_text() methods
- to represent opening tags as text
- encoding: encoding used to encode numeric character references by
- .get_text() and .get_compressed_text() ("ascii" by default)
- entitydefs: mapping like {'&': '&', ...} containing HTML entity
- definitions (a sensible default is used)
-
- If the element name of an opening tag matches a key in the textify
- mapping then that tag is converted to text. The corresponding value is
- used to specify which tag attribute to obtain the text from. textify
- maps from element names to either:
-
- - an HTML attribute name, in which case the HTML attribute value is
- used as its text value along with the element name in square
- brackets (eg."alt text goes here[IMG]", or, if the alt attribute
- were missing, just "[IMG]")
- - a callable object (eg. a function) which takes a Token and returns
- the string to be used as its text value
-
- If textify has no key for an element name, nothing is substituted for
- the opening tag.
-
- Public attributes:
-
- encoding and textify: see above
-
- """
- self._fh = fh
- self._tokenstack = [] # FIFO
- self.textify = textify
- self.encoding = encoding
- if entitydefs is None:
- entitydefs = get_entitydefs()
- self._entitydefs = entitydefs
-
- def __iter__(self): return self
-
- def tags(self, *names):
- return iter_until_exception(self.get_tag, NoMoreTokensError, *names)
-
- def tokens(self, *tokentypes):
- return iter_until_exception(self.get_token, NoMoreTokensError, *tokentypes)
-
- def next(self):
- try:
- return self.get_token()
- except NoMoreTokensError:
- raise StopIteration()
-
- def get_token(self, *tokentypes):
- """Pop the next Token object from the stack of parsed tokens.
-
- If arguments are given, they are taken to be token types in which the
- caller is interested: tokens representing other elements will be
- skipped. Element names must be given in lower case.
-
- Raises NoMoreTokensError.
-
- """
- while 1:
- while self._tokenstack:
- token = self._tokenstack.pop(0)
- if tokentypes:
- if token.type in tokentypes:
- return token
- else:
- return token
- data = self._fh.read(self.chunk)
- if not data:
- raise NoMoreTokensError()
- self.feed(data)
-
- def unget_token(self, token):
- """Push a Token back onto the stack."""
- self._tokenstack.insert(0, token)
-
- def get_tag(self, *names):
- """Return the next Token that represents an opening or closing tag.
-
- If arguments are given, they are taken to be element names in which the
- caller is interested: tags representing other elements will be skipped.
- Element names must be given in lower case.
-
- Raises NoMoreTokensError.
-
- """
- while 1:
- tok = self.get_token()
- if tok.type not in ["starttag", "endtag", "startendtag"]:
- continue
- if names:
- if tok.data in names:
- return tok
- else:
- return tok
-
- def get_text(self, endat=None):
- """Get some text.
-
- endat: stop reading text at this tag (the tag is included in the
- returned text); endtag is a tuple (type, name) where type is
- "starttag", "endtag" or "startendtag", and name is the element name of
- the tag (element names must be given in lower case)
-
- If endat is not given, .get_text() will stop at the next opening or
- closing tag, or when there are no more tokens (no exception is raised).
- Note that .get_text() includes the text representation (if any) of the
- opening tag, but pushes the opening tag back onto the stack. As a
- result, if you want to call .get_text() again, you need to call
- .get_tag() first (unless you want an empty string returned when you
- next call .get_text()).
-
- Entity references are translated using the entitydefs attribute (a
- mapping from names to characters like that provided by the standard
- module htmlentitydefs). Named entity references that are not in this
- mapping are left unchanged.
-
- The textify attribute is used to translate opening tags into text: see
- the class docstring.
-
- """
- text = []
- tok = None
- while 1:
- try:
- tok = self.get_token()
- except NoMoreTokensError:
- # unget last token (not the one we just failed to get)
- if tok: self.unget_token(tok)
- break
- if tok.type == "data":
- text.append(tok.data)
- elif tok.type == "entityref":
- name = tok.data
- if name in self.entitydefs:
- t = self.entitydefs[name]
- else:
- t = "&%s;" % name
- text.append(t)
- elif tok.type == "charref":
- name = tok.data
- t = unichr(int(name)).encode(self.encoding)
- text.append(t)
- elif tok.type in ["starttag", "endtag", "startendtag"]:
- tag_name = tok.data
- if tok.type in ["starttag", "startendtag"]:
- alt = self.textify.get(tag_name)
- if alt is not None:
- if callable(alt):
- text.append(alt(tok))
- elif tok.attrs is not None:
- for k, v in tok.attrs:
- if k == alt:
- text.append(v)
- text.append("[%s]" % tag_name.upper())
- if endat is None or endat == (tok.type, tag_name):
- self.unget_token(tok)
- break
- return "".join(text)
-
- def get_compressed_text(self, *args, **kwds):
- """
- As .get_text(), but collapses each group of contiguous whitespace to a
- single space character, and removes all initial and trailing
- whitespace.
-
- """
- text = self.get_text(*args, **kwds)
- text = text.strip()
- return self.compress_re.sub(" ", text)
-
- def handle_startendtag(self, tag, attrs):
- self._tokenstack.append(Token("startendtag", tag, attrs))
- def handle_starttag(self, tag, attrs):
- self._tokenstack.append(Token("starttag", tag, attrs))
- def handle_endtag(self, tag):
- self._tokenstack.append(Token("endtag", tag))
- def handle_charref(self, name):
- self._tokenstack.append(Token("charref", name))
- def handle_entityref(self, name):
- self._tokenstack.append(Token("entityref", name))
- def handle_data(self, data):
- self._tokenstack.append(Token("data", data))
- def handle_comment(self, data):
- self._tokenstack.append(Token("comment", data))
- def handle_decl(self, decl):
- self._tokenstack.append(Token("decl", decl))
- def unknown_decl(self, data):
- # XXX should this call self.error instead?
- #self.error("unknown declaration: " + `data`)
- self._tokenstack.append(Token("decl", data))
- def handle_pi(self, data):
- self._tokenstack.append(Token("pi", data))
-
- def unescape_attr(self, name):
- return unescape(name, self._entitydefs)
- def unescape_attrs(self, attrs):
- escaped_attrs = []
- for key, val in attrs:
- escaped_attrs.append((key, self.unescape_attr(val)))
- return escaped_attrs
-
-class PullParser(_AbstractParser, HTMLParser.HTMLParser):
- def __init__(self, *args, **kwds):
- HTMLParser.HTMLParser.__init__(self)
- _AbstractParser.__init__(self, *args, **kwds)
- def unescape(self, name):
- # Use the entitydefs passed into constructor, not
- # HTMLParser.HTMLParser's entitydefs.
- return self.unescape_attr(name)
-
-import sgmllib
-class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
- def __init__(self, *args, **kwds):
- sgmllib.SGMLParser.__init__(self)
- _AbstractParser.__init__(self, *args, **kwds)
- def unknown_starttag(self, tag, attrs):
- attrs = self.unescape_attrs(attrs)
- self._tokenstack.append(Token("starttag", tag, attrs))
- def unknown_endtag(self, tag):
- self._tokenstack.append(Token("endtag", tag))
More information about the Zope3-Checkins
mailing list