[Zope3-checkins]
SVN: Zope3/branches/adamg-mechanize-update/src/mechanize/_ untested
Adam Groszer
agroszer at gmail.com
Fri Jul 13 09:21:23 EDT 2007
Log message for revision 77856:
untested
Changed:
U Zope3/branches/adamg-mechanize-update/src/mechanize/__init__.py
U Zope3/branches/adamg-mechanize-update/src/mechanize/_auth.py
A Zope3/branches/adamg-mechanize-update/src/mechanize/_beautifulsoup.py
U Zope3/branches/adamg-mechanize-update/src/mechanize/_clientcookie.py
A Zope3/branches/adamg-mechanize-update/src/mechanize/_debug.py
U Zope3/branches/adamg-mechanize-update/src/mechanize/_gzip.py
U Zope3/branches/adamg-mechanize-update/src/mechanize/_headersutil.py
U Zope3/branches/adamg-mechanize-update/src/mechanize/_html.py
A Zope3/branches/adamg-mechanize-update/src/mechanize/_http.py
U Zope3/branches/adamg-mechanize-update/src/mechanize/_lwpcookiejar.py
U Zope3/branches/adamg-mechanize-update/src/mechanize/_mechanize.py
U Zope3/branches/adamg-mechanize-update/src/mechanize/_mozillacookiejar.py
U Zope3/branches/adamg-mechanize-update/src/mechanize/_msiecookiejar.py
U Zope3/branches/adamg-mechanize-update/src/mechanize/_opener.py
U Zope3/branches/adamg-mechanize-update/src/mechanize/_request.py
A Zope3/branches/adamg-mechanize-update/src/mechanize/_response.py
A Zope3/branches/adamg-mechanize-update/src/mechanize/_rfc3986.py
A Zope3/branches/adamg-mechanize-update/src/mechanize/_seek.py
A Zope3/branches/adamg-mechanize-update/src/mechanize/_upgrade.py
U Zope3/branches/adamg-mechanize-update/src/mechanize/_urllib2.py
D Zope3/branches/adamg-mechanize-update/src/mechanize/_urllib2_support.py
U Zope3/branches/adamg-mechanize-update/src/mechanize/_useragent.py
U Zope3/branches/adamg-mechanize-update/src/mechanize/_util.py
-=-
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/__init__.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/__init__.py 2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/__init__.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -1,3 +1,87 @@
+__all__ = [
+ 'AbstractBasicAuthHandler',
+ 'AbstractDigestAuthHandler',
+ 'BaseHandler',
+ 'Browser',
+ 'BrowserStateError',
+ 'CacheFTPHandler',
+ 'ContentTooShortError',
+ 'Cookie',
+ 'CookieJar',
+ 'CookiePolicy',
+ 'DefaultCookiePolicy',
+ 'DefaultFactory',
+ 'FTPHandler',
+ 'Factory',
+ 'FileCookieJar',
+ 'FileHandler',
+ 'FormNotFoundError',
+ 'FormsFactory',
+ 'GopherError',
+ 'GopherHandler',
+ 'HTTPBasicAuthHandler',
+ 'HTTPCookieProcessor',
+ 'HTTPDefaultErrorHandler',
+ 'HTTPDigestAuthHandler',
+ 'HTTPEquivProcessor',
+ 'HTTPError',
+ 'HTTPErrorProcessor',
+ 'HTTPHandler',
+ 'HTTPPasswordMgr',
+ 'HTTPPasswordMgrWithDefaultRealm',
+ 'HTTPProxyPasswordMgr',
+ 'HTTPRedirectDebugProcessor',
+ 'HTTPRedirectHandler',
+ 'HTTPRefererProcessor',
+ 'HTTPRefreshProcessor',
+ 'HTTPRequestUpgradeProcessor',
+ 'HTTPResponseDebugProcessor',
+ 'HTTPRobotRulesProcessor',
+ 'HTTPSClientCertMgr',
+ 'HTTPSHandler',
+ 'HeadParser',
+ 'History',
+ 'LWPCookieJar',
+ 'Link',
+ 'LinkNotFoundError',
+ 'LinksFactory',
+ 'LoadError',
+ 'MSIECookieJar',
+ 'MozillaCookieJar',
+ 'OpenerDirector',
+ 'OpenerFactory',
+ 'ParseError',
+ 'ProxyBasicAuthHandler',
+ 'ProxyDigestAuthHandler',
+ 'ProxyHandler',
+ 'Request',
+ 'ResponseUpgradeProcessor',
+ 'RobotExclusionError',
+ 'RobustFactory',
+ 'RobustFormsFactory',
+ 'RobustLinksFactory',
+ 'RobustTitleFactory',
+ 'SeekableProcessor',
+ 'SeekableResponseOpener',
+ 'TitleFactory',
+ 'URLError',
+ 'USE_BARE_EXCEPT',
+ 'UnknownHandler',
+ 'UserAgent',
+ 'UserAgentBase',
+ 'XHTMLCompatibleHeadParser',
+ '__version__',
+ 'build_opener',
+ 'install_opener',
+ 'lwp_cookie_str',
+ 'make_response',
+ 'request_host',
+ 'response_seek_wrapper', # XXX deprecate in public interface?
+ 'seek_wrapped_response', # XXX should probably use this internally in place of response_seek_wrapper()
+ 'str2time',
+ 'urlopen',
+ 'urlretrieve']
+
from _mechanize import __version__
# high-level stateful browser-style interface
@@ -2,8 +86,9 @@
from _mechanize import \
- Browser, \
+ Browser, History, \
BrowserStateError, LinkNotFoundError, FormNotFoundError
# configurable URL-opener interface
-from _useragent import UserAgent
+from _useragent import UserAgentBase, UserAgent
from _html import \
+ ParseError, \
Link, \
@@ -14,19 +99,20 @@
RobustFormsFactory, RobustLinksFactory, RobustTitleFactory
# urllib2 work-alike interface (part from mechanize, part from urllib2)
+# This is a superset of the urllib2 interface.
from _urllib2 import *
# misc
+from _opener import ContentTooShortError, OpenerFactory, urlretrieve
from _util import http2time as str2time
-from _util import response_seek_wrapper, make_response
-from _urllib2_support import HeadParser
+from _response import \
+ response_seek_wrapper, seek_wrapped_response, make_response
+from _http import HeadParser
try:
- from _urllib2_support import XHTMLCompatibleHeadParser
+ from _http import XHTMLCompatibleHeadParser
except ImportError:
pass
-#from _gzip import HTTPGzipProcessor # crap ATM
-
# cookies
from _clientcookie import Cookie, CookiePolicy, DefaultCookiePolicy, \
CookieJar, FileCookieJar, LoadError, request_host
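The re-exports above make both the stateful browser interface and the urllib2-style opener interface available from the package root. A minimal sketch of the high-level usage this is meant to support, assuming the standard Browser/Link API named in __all__ (the URL is a placeholder and error handling is omitted):

    import mechanize

    br = mechanize.Browser()
    response = br.open("http://example.com/")  # responses are seekable wrappers
    print response.geturl()
    print br.title()
    for link in br.links():
        print link.url, link.text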
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_auth.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_auth.py 2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_auth.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -11,10 +11,11 @@
"""
-import re, base64, urlparse, posixpath, md5, sha
+import re, base64, urlparse, posixpath, md5, sha, sys, copy
from urllib2 import BaseHandler
-from urllib import getproxies, unquote, splittype, splituser, splitpasswd
+from urllib import getproxies, unquote, splittype, splituser, splitpasswd, \
+ splitport
def _parse_proxy(proxy):
@@ -135,32 +136,45 @@
# uri could be a single URI or a sequence
if isinstance(uri, basestring):
uri = [uri]
- uri = tuple(map(self.reduce_uri, uri))
if not realm in self.passwd:
self.passwd[realm] = {}
- self.passwd[realm][uri] = (user, passwd)
+ for default_port in True, False:
+ reduced_uri = tuple(
+ [self.reduce_uri(u, default_port) for u in uri])
+ self.passwd[realm][reduced_uri] = (user, passwd)
def find_user_password(self, realm, authuri):
domains = self.passwd.get(realm, {})
- authuri = self.reduce_uri(authuri)
- for uris, authinfo in domains.iteritems():
- for uri in uris:
- if self.is_suburi(uri, authuri):
- return authinfo
+ for default_port in True, False:
+ reduced_authuri = self.reduce_uri(authuri, default_port)
+ for uris, authinfo in domains.iteritems():
+ for uri in uris:
+ if self.is_suburi(uri, reduced_authuri):
+ return authinfo
return None, None
- def reduce_uri(self, uri):
- """Accept netloc or URI and extract only the netloc and path"""
+ def reduce_uri(self, uri, default_port=True):
+ """Accept authority or URI and extract only the authority and path."""
+ # note HTTP URLs do not have a userinfo component
parts = urlparse.urlsplit(uri)
if parts[1]:
# URI
- return parts[1], parts[2] or '/'
- elif parts[0]:
- # host:port
- return uri, '/'
+ scheme = parts[0]
+ authority = parts[1]
+ path = parts[2] or '/'
else:
- # host
- return parts[2], '/'
+ # host or host:port
+ scheme = None
+ authority = uri
+ path = '/'
+ host, port = splitport(authority)
+ if default_port and port is None and scheme is not None:
+ dport = {"http": 80,
+ "https": 443,
+ }.get(scheme)
+ if dport is not None:
+ authority = "%s:%d" % (host, dport)
+ return authority, path
def is_suburi(self, base, test):
"""Check if test is below base in a URI tree
@@ -220,8 +234,10 @@
auth = 'Basic %s' % base64.encodestring(raw).strip()
if req.headers.get(self.auth_header, None) == auth:
return None
- req.add_header(self.auth_header, auth)
- return self.parent.open(req)
+ newreq = copy.copy(req)
+ newreq.add_header(self.auth_header, auth)
+ newreq.visit = False
+ return self.parent.open(newreq)
else:
return None
@@ -311,9 +327,10 @@
auth_val = 'Digest %s' % auth
if req.headers.get(self.auth_header, None) == auth_val:
return None
- req.add_unredirected_header(self.auth_header, auth_val)
- resp = self.parent.open(req)
- return resp
+ newreq = copy.copy(req)
+ newreq.add_unredirected_header(self.auth_header, auth_val)
+ newreq.visit = False
+ return self.parent.open(newreq)
def get_cnonce(self, nonce):
# The cnonce-value is an opaque
@@ -404,6 +421,7 @@
"""
auth_header = 'Authorization'
+ handler_order = 490
def http_error_401(self, req, fp, code, msg, headers):
host = urlparse.urlparse(req.get_full_url())[1]
@@ -416,6 +434,7 @@
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
auth_header = 'Proxy-Authorization'
+ handler_order = 490
def http_error_407(self, req, fp, code, msg, headers):
host = req.get_host()
@@ -425,7 +444,7 @@
return retry
-
+# XXX ugly implementation, should probably not bother deriving
class HTTPProxyPasswordMgr(HTTPPasswordMgr):
# has default realm and host/port
def add_password(self, realm, uri, user, passwd):
@@ -436,32 +455,34 @@
uris = uri
passwd_by_domain = self.passwd.setdefault(realm, {})
for uri in uris:
- uri = self.reduce_uri(uri)
- passwd_by_domain[uri] = (user, passwd)
+ for default_port in True, False:
+ reduced_uri = self.reduce_uri(uri, default_port)
+ passwd_by_domain[reduced_uri] = (user, passwd)
def find_user_password(self, realm, authuri):
- perms = [(realm, authuri), (None, authuri)]
+ attempts = [(realm, authuri), (None, authuri)]
# bleh, want default realm to take precedence over default
# URI/authority, hence this outer loop
for default_uri in False, True:
- for realm, authuri in perms:
+ for realm, authuri in attempts:
authinfo_by_domain = self.passwd.get(realm, {})
- reduced_authuri = self.reduce_uri(authuri)
- for uri, authinfo in authinfo_by_domain.iteritems():
- if uri is None and not default_uri:
- continue
- if self.is_suburi(uri, reduced_authuri):
- return authinfo
- user, password = None, None
+ for default_port in True, False:
+ reduced_authuri = self.reduce_uri(authuri, default_port)
+ for uri, authinfo in authinfo_by_domain.iteritems():
+ if uri is None and not default_uri:
+ continue
+ if self.is_suburi(uri, reduced_authuri):
+ return authinfo
+ user, password = None, None
- if user is not None:
- break
+ if user is not None:
+ break
return user, password
- def reduce_uri(self, uri):
+ def reduce_uri(self, uri, default_port=True):
if uri is None:
return None
- return HTTPPasswordMgr.reduce_uri(self, uri)
+ return HTTPPasswordMgr.reduce_uri(self, uri, default_port)
def is_suburi(self, base, test):
if base is None:
@@ -469,3 +490,11 @@
hostport, path = test
base = (hostport, "/")
return HTTPPasswordMgr.is_suburi(self, base, test)
+
+
+class HTTPSClientCertMgr(HTTPPasswordMgr):
+ # implementation inheritance: this is not a proper subclass
+ def add_key_cert(self, uri, key_file, cert_file):
+ self.add_password(None, uri, key_file, cert_file)
+ def find_key_cert(self, authuri):
+ return HTTPPasswordMgr.find_user_password(self, None, authuri)
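The reworked reduce_uri() now registers each credential under both the bare authority and the authority with the scheme's default port, so a password added for "http://example.com/private/" is also found when the request URI spells out ":80". A minimal sketch of the intended matching behaviour (realm, host, paths and file names are made up for illustration):

    from mechanize import HTTPPasswordMgr, HTTPSClientCertMgr

    mgr = HTTPPasswordMgr()
    mgr.add_password("realm", "http://example.com/private/", "joe", "secret")
    # both lookups should return ("joe", "secret")
    print mgr.find_user_password("realm", "http://example.com/private/page")
    print mgr.find_user_password("realm", "http://example.com:80/private/page")

    # the new HTTPSClientCertMgr reuses the same machinery for client certs
    certs = HTTPSClientCertMgr()
    certs.add_key_cert("https://example.com/", "client.key", "client.crt")
    print certs.find_key_cert("https://example.com/account")  # -> ("client.key", "client.crt")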
Added: Zope3/branches/adamg-mechanize-update/src/mechanize/_beautifulsoup.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_beautifulsoup.py (rev 0)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_beautifulsoup.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -0,0 +1,1080 @@
+"""Beautiful Soup
+Elixir and Tonic
+"The Screen-Scraper's Friend"
+v2.1.1
+http://www.crummy.com/software/BeautifulSoup/
+
+Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance
+into a tree representation. It provides methods and Pythonic idioms
+that make it easy to search and modify the tree.
+
+A well-formed XML/HTML document will yield a well-formed data
+structure. An ill-formed XML/HTML document will yield a
+correspondingly ill-formed data structure. If your document is only
+locally well-formed, you can use this library to find and process the
+well-formed part of it. The BeautifulSoup class has heuristics for
+obtaining a sensible parse tree in the face of common HTML errors.
+
+Beautiful Soup has no external dependencies. It works with Python 2.2
+and up.
+
+Beautiful Soup defines classes for four different parsing strategies:
+
+ * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
+ language that kind of looks like XML.
+
+ * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
+ or invalid.
+
+ * ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML
+ that trips up BeautifulSoup.
+
+ * BeautifulSOAP, for making it easier to parse XML documents that use
+ lots of subelements containing a single string, where you'd prefer
+ they put that string into an attribute (such as SOAP messages).
+
+You can subclass BeautifulStoneSoup or BeautifulSoup to create a
+parsing strategy specific to an XML schema or a particular bizarre
+HTML document. Typically your subclass would just override
+SELF_CLOSING_TAGS and/or NESTABLE_TAGS.
+"""
+from __future__ import generators
+
+__author__ = "Leonard Richardson (leonardr at segfault.org)"
+__version__ = "2.1.1"
+__date__ = "$Date$"
+__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson"
+__license__ = "PSF"
+
+from sgmllib import SGMLParser, SGMLParseError
+import types
+import re
+import sgmllib
+
+#This code makes Beautiful Soup able to parse XML with namespaces
+sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+
+class NullType(object):
+
+ """Similar to NoneType with a corresponding singleton instance
+ 'Null' that, unlike None, accepts any message and returns itself.
+
+ Examples:
+ >>> Null("send", "a", "message")("and one more",
+ ... "and what you get still") is Null
+ True
+ """
+
+ def __new__(cls): return Null
+ def __call__(self, *args, **kwargs): return Null
+## def __getstate__(self, *args): return Null
+ def __getattr__(self, attr): return Null
+ def __getitem__(self, item): return Null
+ def __setattr__(self, attr, value): pass
+ def __setitem__(self, item, value): pass
+ def __len__(self): return 0
+ # FIXME: is this a python bug? otherwise ``for x in Null: pass``
+ # never terminates...
+ def __iter__(self): return iter([])
+ def __contains__(self, item): return False
+ def __repr__(self): return "Null"
+Null = object.__new__(NullType)
+
+class PageElement:
+ """Contains the navigational information for some part of the page
+ (either a tag or a piece of text)"""
+
+ def setup(self, parent=Null, previous=Null):
+ """Sets up the initial relations between this element and
+ other elements."""
+ self.parent = parent
+ self.previous = previous
+ self.next = Null
+ self.previousSibling = Null
+ self.nextSibling = Null
+ if self.parent and self.parent.contents:
+ self.previousSibling = self.parent.contents[-1]
+ self.previousSibling.nextSibling = self
+
+ def findNext(self, name=None, attrs={}, text=None):
+ """Returns the first item that matches the given criteria and
+ appears after this Tag in the document."""
+ return self._first(self.fetchNext, name, attrs, text)
+ firstNext = findNext
+
+ def fetchNext(self, name=None, attrs={}, text=None, limit=None):
+ """Returns all items that match the given criteria and appear
+ after this Tag in the document."""
+ return self._fetch(name, attrs, text, limit, self.nextGenerator)
+
+ def findNextSibling(self, name=None, attrs={}, text=None):
+ """Returns the closest sibling to this Tag that matches the
+ given criteria and appears after this Tag in the document."""
+ return self._first(self.fetchNextSiblings, name, attrs, text)
+ firstNextSibling = findNextSibling
+
+ def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None):
+ """Returns the siblings of this Tag that match the given
+ criteria and appear after this Tag in the document."""
+ return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator)
+
+ def findPrevious(self, name=None, attrs={}, text=None):
+ """Returns the first item that matches the given criteria and
+ appears before this Tag in the document."""
+ return self._first(self.fetchPrevious, name, attrs, text)
+
+ def fetchPrevious(self, name=None, attrs={}, text=None, limit=None):
+ """Returns all items that match the given criteria and appear
+ before this Tag in the document."""
+ return self._fetch(name, attrs, text, limit, self.previousGenerator)
+ firstPrevious = findPrevious
+
+ def findPreviousSibling(self, name=None, attrs={}, text=None):
+ """Returns the closest sibling to this Tag that matches the
+ given criteria and appears before this Tag in the document."""
+ return self._first(self.fetchPreviousSiblings, name, attrs, text)
+ firstPreviousSibling = findPreviousSibling
+
+ def fetchPreviousSiblings(self, name=None, attrs={}, text=None,
+ limit=None):
+ """Returns the siblings of this Tag that match the given
+ criteria and appear before this Tag in the document."""
+ return self._fetch(name, attrs, text, limit,
+ self.previousSiblingGenerator)
+
+ def findParent(self, name=None, attrs={}):
+ """Returns the closest parent of this Tag that matches the given
+ criteria."""
+ r = Null
+ l = self.fetchParents(name, attrs, 1)
+ if l:
+ r = l[0]
+ return r
+ firstParent = findParent
+
+ def fetchParents(self, name=None, attrs={}, limit=None):
+ """Returns the parents of this Tag that match the given
+ criteria."""
+ return self._fetch(name, attrs, None, limit, self.parentGenerator)
+
+ #These methods do the real heavy lifting.
+
+ def _first(self, method, name, attrs, text):
+ r = Null
+ l = method(name, attrs, text, 1)
+ if l:
+ r = l[0]
+ return r
+
+ def _fetch(self, name, attrs, text, limit, generator):
+ "Iterates over a generator looking for things that match."
+ if not hasattr(attrs, 'items'):
+ attrs = {'class' : attrs}
+
+ results = []
+ g = generator()
+ while True:
+ try:
+ i = g.next()
+ except StopIteration:
+ break
+ found = None
+ if isinstance(i, Tag):
+ if not text:
+ if not name or self._matches(i, name):
+ match = True
+ for attr, matchAgainst in attrs.items():
+ check = i.get(attr)
+ if not self._matches(check, matchAgainst):
+ match = False
+ break
+ if match:
+ found = i
+ elif text:
+ if self._matches(i, text):
+ found = i
+ if found:
+ results.append(found)
+ if limit and len(results) >= limit:
+ break
+ return results
+
+ #Generators that can be used to navigate starting from both
+ #NavigableTexts and Tags.
+ def nextGenerator(self):
+ i = self
+ while i:
+ i = i.next
+ yield i
+
+ def nextSiblingGenerator(self):
+ i = self
+ while i:
+ i = i.nextSibling
+ yield i
+
+ def previousGenerator(self):
+ i = self
+ while i:
+ i = i.previous
+ yield i
+
+ def previousSiblingGenerator(self):
+ i = self
+ while i:
+ i = i.previousSibling
+ yield i
+
+ def parentGenerator(self):
+ i = self
+ while i:
+ i = i.parent
+ yield i
+
+ def _matches(self, chunk, howToMatch):
+ #print 'looking for %s in %s' % (howToMatch, chunk)
+ #
+ # If given a list of items, return true if the list contains a
+ # text element that matches.
+ if isList(chunk) and not isinstance(chunk, Tag):
+ for tag in chunk:
+ if isinstance(tag, NavigableText) and self._matches(tag, howToMatch):
+ return True
+ return False
+ if callable(howToMatch):
+ return howToMatch(chunk)
+ if isinstance(chunk, Tag):
+ #Custom match methods take the tag as an argument, but all other
+ #ways of matching match the tag name as a string
+ chunk = chunk.name
+ #Now we know that chunk is a string
+ if not isinstance(chunk, basestring):
+ chunk = str(chunk)
+ if hasattr(howToMatch, 'match'):
+ # It's a regexp object.
+ return howToMatch.search(chunk)
+ if isList(howToMatch):
+ return chunk in howToMatch
+ if hasattr(howToMatch, 'items'):
+ return howToMatch.has_key(chunk)
+ #It's just a string
+ return str(howToMatch) == chunk
+
+class NavigableText(PageElement):
+
+ def __getattr__(self, attr):
+ "For backwards compatibility, text.string gives you text"
+ if attr == 'string':
+ return self
+ else:
+ raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
+
+class NavigableString(str, NavigableText):
+ pass
+
+class NavigableUnicodeString(unicode, NavigableText):
+ pass
+
+class Tag(PageElement):
+
+ """Represents a found HTML tag with its attributes and contents."""
+
+ def __init__(self, name, attrs=None, parent=Null, previous=Null):
+ "Basic constructor."
+ self.name = name
+ if attrs == None:
+ attrs = []
+ self.attrs = attrs
+ self.contents = []
+ self.setup(parent, previous)
+ self.hidden = False
+
+ def get(self, key, default=None):
+ """Returns the value of the 'key' attribute for the tag, or
+ the value given for 'default' if it doesn't have that
+ attribute."""
+ return self._getAttrMap().get(key, default)
+
+ def __getitem__(self, key):
+ """tag[key] returns the value of the 'key' attribute for the tag,
+ and throws an exception if it's not there."""
+ return self._getAttrMap()[key]
+
+ def __iter__(self):
+ "Iterating over a tag iterates over its contents."
+ return iter(self.contents)
+
+ def __len__(self):
+ "The length of a tag is the length of its list of contents."
+ return len(self.contents)
+
+ def __contains__(self, x):
+ return x in self.contents
+
+ def __nonzero__(self):
+ "A tag is non-None even if it has no contents."
+ return True
+
+ def __setitem__(self, key, value):
+ """Setting tag[key] sets the value of the 'key' attribute for the
+ tag."""
+ self._getAttrMap()
+ self.attrMap[key] = value
+ found = False
+ for i in range(0, len(self.attrs)):
+ if self.attrs[i][0] == key:
+ self.attrs[i] = (key, value)
+ found = True
+ if not found:
+ self.attrs.append((key, value))
+ self._getAttrMap()[key] = value
+
+ def __delitem__(self, key):
+ "Deleting tag[key] deletes all 'key' attributes for the tag."
+ for item in self.attrs:
+ if item[0] == key:
+ self.attrs.remove(item)
+ #We don't break because bad HTML can define the same
+ #attribute multiple times.
+ self._getAttrMap()
+ if self.attrMap.has_key(key):
+ del self.attrMap[key]
+
+ def __call__(self, *args, **kwargs):
+ """Calling a tag like a function is the same as calling its
+ fetch() method. Eg. tag('a') returns a list of all the A tags
+ found within this tag."""
+ return apply(self.fetch, args, kwargs)
+
+ def __getattr__(self, tag):
+ if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
+ return self.first(tag[:-3])
+ elif tag.find('__') != 0:
+ return self.first(tag)
+
+ def __eq__(self, other):
+ """Returns true iff this tag has the same name, the same attributes,
+ and the same contents (recursively) as the given tag.
+
+ NOTE: right now this will return false if two tags have the
+ same attributes in a different order. Should this be fixed?"""
+ if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
+ return False
+ for i in range(0, len(self.contents)):
+ if self.contents[i] != other.contents[i]:
+ return False
+ return True
+
+ def __ne__(self, other):
+ """Returns true iff this tag is not identical to the other tag,
+ as defined in __eq__."""
+ return not self == other
+
+ def __repr__(self):
+ """Renders this tag as a string."""
+ return str(self)
+
+ def __unicode__(self):
+ return self.__str__(1)
+
+ def __str__(self, needUnicode=None, showStructureIndent=None):
+ """Returns a string or Unicode representation of this tag and
+ its contents.
+
+ NOTE: since Python's HTML parser consumes whitespace, this
+ method is not certain to reproduce the whitespace present in
+ the original string."""
+
+ attrs = []
+ if self.attrs:
+ for key, val in self.attrs:
+ attrs.append('%s="%s"' % (key, val))
+ close = ''
+ closeTag = ''
+ if self.isSelfClosing():
+ close = ' /'
+ else:
+ closeTag = '</%s>' % self.name
+ indentIncrement = None
+ if showStructureIndent != None:
+ indentIncrement = showStructureIndent
+ if not self.hidden:
+ indentIncrement += 1
+ contents = self.renderContents(indentIncrement, needUnicode=needUnicode)
+ if showStructureIndent:
+ space = '\n%s' % (' ' * showStructureIndent)
+ if self.hidden:
+ s = contents
+ else:
+ s = []
+ attributeString = ''
+ if attrs:
+ attributeString = ' ' + ' '.join(attrs)
+ if showStructureIndent:
+ s.append(space)
+ s.append('<%s%s%s>' % (self.name, attributeString, close))
+ s.append(contents)
+ if closeTag and showStructureIndent != None:
+ s.append(space)
+ s.append(closeTag)
+ s = ''.join(s)
+ isUnicode = type(s) == types.UnicodeType
+ if needUnicode and not isUnicode:
+ s = unicode(s)
+ elif isUnicode and needUnicode==False:
+ s = str(s)
+ return s
+
+ def prettify(self, needUnicode=None):
+ return self.__str__(needUnicode, showStructureIndent=True)
+
+ def renderContents(self, showStructureIndent=None, needUnicode=None):
+ """Renders the contents of this tag as a (possibly Unicode)
+ string."""
+ s=[]
+ for c in self:
+ text = None
+ if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType:
+ text = unicode(c)
+ elif isinstance(c, Tag):
+ s.append(c.__str__(needUnicode, showStructureIndent))
+ elif needUnicode:
+ text = unicode(c)
+ else:
+ text = str(c)
+ if text:
+ if showStructureIndent != None:
+ if text[-1] == '\n':
+ text = text[:-1]
+ s.append(text)
+ return ''.join(s)
+
+ #Soup methods
+
+ def firstText(self, text, recursive=True):
+ """Convenience method to retrieve the first piece of text matching the
+ given criteria. 'text' can be a string, a regular expression object,
+ a callable that takes a string and returns whether or not the
+ string 'matches', etc."""
+ return self.first(recursive=recursive, text=text)
+
+ def fetchText(self, text, recursive=True, limit=None):
+ """Convenience method to retrieve all pieces of text matching the
+ given criteria. 'text' can be a string, a regular expression object,
+ a callable that takes a string and returns whether or not the
+ string 'matches', etc."""
+ return self.fetch(recursive=recursive, text=text, limit=limit)
+
+ def first(self, name=None, attrs={}, recursive=True, text=None):
+ """Return only the first child of this
+ Tag matching the given criteria."""
+ r = Null
+ l = self.fetch(name, attrs, recursive, text, 1)
+ if l:
+ r = l[0]
+ return r
+ findChild = first
+
+ def fetch(self, name=None, attrs={}, recursive=True, text=None,
+ limit=None):
+ """Extracts a list of Tag objects that match the given
+ criteria. You can specify the name of the Tag and any
+ attributes you want the Tag to have.
+
+ The value of a key-value pair in the 'attrs' map can be a
+ string, a list of strings, a regular expression object, or a
+ callable that takes a string and returns whether or not the
+ string matches for some custom definition of 'matches'. The
+ same is true of the tag name."""
+ generator = self.recursiveChildGenerator
+ if not recursive:
+ generator = self.childGenerator
+ return self._fetch(name, attrs, text, limit, generator)
+ fetchChildren = fetch
+
+ #Utility methods
+
+ def isSelfClosing(self):
+ """Returns true iff this is a self-closing tag as defined in the HTML
+ standard.
+
+ TODO: This is specific to BeautifulSoup and its subclasses, but it's
+ used by __str__"""
+ return self.name in BeautifulSoup.SELF_CLOSING_TAGS
+
+ def append(self, tag):
+ """Appends the given tag to the contents of this tag."""
+ self.contents.append(tag)
+
+ #Private methods
+
+ def _getAttrMap(self):
+ """Initializes a map representation of this tag's attributes,
+ if not already initialized."""
+ if not getattr(self, 'attrMap'):
+ self.attrMap = {}
+ for (key, value) in self.attrs:
+ self.attrMap[key] = value
+ return self.attrMap
+
+ #Generator methods
+ def childGenerator(self):
+ for i in range(0, len(self.contents)):
+ yield self.contents[i]
+ raise StopIteration
+
+ def recursiveChildGenerator(self):
+ stack = [(self, 0)]
+ while stack:
+ tag, start = stack.pop()
+ if isinstance(tag, Tag):
+ for i in range(start, len(tag.contents)):
+ a = tag.contents[i]
+ yield a
+ if isinstance(a, Tag) and tag.contents:
+ if i < len(tag.contents) - 1:
+ stack.append((tag, i+1))
+ stack.append((a, 0))
+ break
+ raise StopIteration
+
+
+def isList(l):
+ """Convenience method that works with all 2.x versions of Python
+ to determine whether or not something is listlike."""
+ return hasattr(l, '__iter__') \
+ or (type(l) in (types.ListType, types.TupleType))
+
+def buildTagMap(default, *args):
+ """Turns a list of maps, lists, or scalars into a single map.
+ Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
+ of lists and partial maps."""
+ built = {}
+ for portion in args:
+ if hasattr(portion, 'items'):
+ #It's a map. Merge it.
+ for k,v in portion.items():
+ built[k] = v
+ elif isList(portion):
+ #It's a list. Map each item to the default.
+ for k in portion:
+ built[k] = default
+ else:
+ #It's a scalar. Map it to the default.
+ built[portion] = default
+ return built
+
+class BeautifulStoneSoup(Tag, SGMLParser):
+
+ """This class contains the basic parser and fetch code. It defines
+ a parser that knows nothing about tag behavior except for the
+ following:
+
+ You can't close a tag without closing all the tags it encloses.
+ That is, "<foo><bar></foo>" actually means
+ "<foo><bar></bar></foo>".
+
+ [Another possible explanation is "<foo><bar /></foo>", but since
+ this class defines no SELF_CLOSING_TAGS, it will never use that
+ explanation.]
+
+ This class is useful for parsing XML or made-up markup languages,
+ or when BeautifulSoup makes an assumption counter to what you were
+ expecting."""
+
+ SELF_CLOSING_TAGS = {}
+ NESTABLE_TAGS = {}
+ RESET_NESTING_TAGS = {}
+ QUOTE_TAGS = {}
+
+ #As a public service we will by default silently replace MS smart quotes
+ #and similar characters with their HTML or ASCII equivalents.
+ MS_CHARS = { '\x80' : '€',
+ '\x81' : ' ',
+ '\x82' : '‚',
+ '\x83' : 'ƒ',
+ '\x84' : '„',
+ '\x85' : '…',
+ '\x86' : '†',
+ '\x87' : '‡',
+ '\x88' : 'ˆ',
+ '\x89' : '‰',
+ '\x8A' : 'Š',
+ '\x8B' : '<',
+ '\x8C' : 'Œ',
+ '\x8D' : '?',
+ '\x8E' : 'Z',
+ '\x8F' : '?',
+ '\x90' : '?',
+ '\x91' : '‘',
+ '\x92' : '’',
+ '\x93' : '“',
+ '\x94' : '”',
+ '\x95' : '•',
+ '\x96' : '–',
+ '\x97' : '—',
+ '\x98' : '˜',
+ '\x99' : '™',
+ '\x9a' : 'š',
+ '\x9b' : '>',
+ '\x9c' : 'œ',
+ '\x9d' : '?',
+ '\x9e' : 'z',
+ '\x9f' : 'Ÿ',}
+
+ PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
+ lambda(x):x.group(1) + ' />'),
+ (re.compile('<!\s+([^<>]*)>'),
+ lambda(x):'<!' + x.group(1) + '>'),
+ (re.compile("([\x80-\x9f])"),
+ lambda(x): BeautifulStoneSoup.MS_CHARS.get(x.group(1)))
+ ]
+
+ ROOT_TAG_NAME = '[document]'
+
+ def __init__(self, text=None, avoidParserProblems=True,
+ initialTextIsEverything=True):
+ """Initialize this as the 'root tag' and feed in any text to
+ the parser.
+
+ NOTE about avoidParserProblems: sgmllib will process most bad
+ HTML, and BeautifulSoup has tricks for dealing with some HTML
+ that kills sgmllib, but Beautiful Soup can nonetheless choke
+ or lose data if your data uses self-closing tags or
+ declarations incorrectly. By default, Beautiful Soup sanitizes
+ its input to avoid the vast majority of these problems. The
+ problems are relatively rare, even in bad HTML, so feel free
+ to pass in False to avoidParserProblems if they don't apply to
+ you, and you'll get better performance. The only reason I have
+ this turned on by default is so I don't get so many tech
+ support questions.
+
+ The two most common instances of invalid HTML that will choke
+ sgmllib are fixed by the default parser massage techniques:
+
+ <br/> (No space between name of closing tag and tag close)
+ <! --Comment--> (Extraneous whitespace in declaration)
+
+ You can pass in a custom list of (RE object, replace method)
+ tuples to get Beautiful Soup to scrub your input the way you
+ want."""
+ Tag.__init__(self, self.ROOT_TAG_NAME)
+ if avoidParserProblems \
+ and not isList(avoidParserProblems):
+ avoidParserProblems = self.PARSER_MASSAGE
+ self.avoidParserProblems = avoidParserProblems
+ SGMLParser.__init__(self)
+ self.quoteStack = []
+ self.hidden = 1
+ self.reset()
+ if hasattr(text, 'read'):
+ #It's a file-type object.
+ text = text.read()
+ if text:
+ self.feed(text)
+ if initialTextIsEverything:
+ self.done()
+
+ def __getattr__(self, methodName):
+ """This method routes method call requests to either the SGMLParser
+ superclass or the Tag superclass, depending on the method name."""
+ if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
+ or methodName.find('do_') == 0:
+ return SGMLParser.__getattr__(self, methodName)
+ elif methodName.find('__') != 0:
+ return Tag.__getattr__(self, methodName)
+ else:
+ raise AttributeError
+
+ def feed(self, text):
+ if self.avoidParserProblems:
+ for fix, m in self.avoidParserProblems:
+ text = fix.sub(m, text)
+ SGMLParser.feed(self, text)
+
+ def done(self):
+ """Called when you're done parsing, so that the unclosed tags can be
+ correctly processed."""
+ self.endData() #NEW
+ while self.currentTag.name != self.ROOT_TAG_NAME:
+ self.popTag()
+
+ def reset(self):
+ SGMLParser.reset(self)
+ self.currentData = []
+ self.currentTag = None
+ self.tagStack = []
+ self.pushTag(self)
+
+ def popTag(self):
+ tag = self.tagStack.pop()
+ # Tags with just one string-owning child get the child as a
+ # 'string' property, so that soup.tag.string is shorthand for
+ # soup.tag.contents[0]
+ if len(self.currentTag.contents) == 1 and \
+ isinstance(self.currentTag.contents[0], NavigableText):
+ self.currentTag.string = self.currentTag.contents[0]
+
+ #print "Pop", tag.name
+ if self.tagStack:
+ self.currentTag = self.tagStack[-1]
+ return self.currentTag
+
+ def pushTag(self, tag):
+ #print "Push", tag.name
+ if self.currentTag:
+ self.currentTag.append(tag)
+ self.tagStack.append(tag)
+ self.currentTag = self.tagStack[-1]
+
+ def endData(self):
+ currentData = ''.join(self.currentData)
+ if currentData:
+ if not currentData.strip():
+ if '\n' in currentData:
+ currentData = '\n'
+ else:
+ currentData = ' '
+ c = NavigableString
+ if type(currentData) == types.UnicodeType:
+ c = NavigableUnicodeString
+ o = c(currentData)
+ o.setup(self.currentTag, self.previous)
+ if self.previous:
+ self.previous.next = o
+ self.previous = o
+ self.currentTag.contents.append(o)
+ self.currentData = []
+
+ def _popToTag(self, name, inclusivePop=True):
+ """Pops the tag stack up to and including the most recent
+ instance of the given tag. If inclusivePop is false, pops the tag
+ stack up to but *not* including the most recent instance of
+ the given tag."""
+ if name == self.ROOT_TAG_NAME:
+ return
+
+ numPops = 0
+ mostRecentTag = None
+ for i in range(len(self.tagStack)-1, 0, -1):
+ if name == self.tagStack[i].name:
+ numPops = len(self.tagStack)-i
+ break
+ if not inclusivePop:
+ numPops = numPops - 1
+
+ for i in range(0, numPops):
+ mostRecentTag = self.popTag()
+ return mostRecentTag
+
+ def _smartPop(self, name):
+
+ """We need to pop up to the previous tag of this type, unless
+ one of this tag's nesting reset triggers comes between this
+ tag and the previous tag of this type, OR unless this tag is a
+ generic nesting trigger and another generic nesting trigger
+ comes between this tag and the previous tag of this type.
+
+ Examples:
+ <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
+ <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
+ <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
+ <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
+
+ <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
+ <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
+ <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
+ """
+
+ nestingResetTriggers = self.NESTABLE_TAGS.get(name)
+ isNestable = nestingResetTriggers != None
+ isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
+ popTo = None
+ inclusive = True
+ for i in range(len(self.tagStack)-1, 0, -1):
+ p = self.tagStack[i]
+ if (not p or p.name == name) and not isNestable:
+ #Non-nestable tags get popped to the top or to their
+ #last occurrence.
+ popTo = name
+ break
+ if (nestingResetTriggers != None
+ and p.name in nestingResetTriggers) \
+ or (nestingResetTriggers == None and isResetNesting
+ and self.RESET_NESTING_TAGS.has_key(p.name)):
+
+ #If we encounter one of the nesting reset triggers
+ #peculiar to this tag, or we encounter another tag
+ #that causes nesting to reset, pop up to but not
+ #including that tag.
+
+ popTo = p.name
+ inclusive = False
+ break
+ p = p.parent
+ if popTo:
+ self._popToTag(popTo, inclusive)
+
+ def unknown_starttag(self, name, attrs, selfClosing=0):
+ #print "Start tag %s" % name
+ if self.quoteStack:
+ #This is not a real tag.
+ #print "<%s> is not real!" % name
+ attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
+ self.handle_data('<%s%s>' % (name, attrs))
+ return
+ self.endData()
+ if not name in self.SELF_CLOSING_TAGS and not selfClosing:
+ self._smartPop(name)
+ tag = Tag(name, attrs, self.currentTag, self.previous)
+ if self.previous:
+ self.previous.next = tag
+ self.previous = tag
+ self.pushTag(tag)
+ if selfClosing or name in self.SELF_CLOSING_TAGS:
+ self.popTag()
+ if name in self.QUOTE_TAGS:
+ #print "Beginning quote (%s)" % name
+ self.quoteStack.append(name)
+ self.literal = 1
+
+ def unknown_endtag(self, name):
+ if self.quoteStack and self.quoteStack[-1] != name:
+ #This is not a real end tag.
+ #print "</%s> is not real!" % name
+ self.handle_data('</%s>' % name)
+ return
+ self.endData()
+ self._popToTag(name)
+ if self.quoteStack and self.quoteStack[-1] == name:
+ self.quoteStack.pop()
+ self.literal = (len(self.quoteStack) > 0)
+
+ def handle_data(self, data):
+ self.currentData.append(data)
+
+ def handle_pi(self, text):
+ "Propagate processing instructions right through."
+ self.handle_data("<?%s>" % text)
+
+ def handle_comment(self, text):
+ "Propagate comments right through."
+ self.handle_data("<!--%s-->" % text)
+
+ def handle_charref(self, ref):
+ "Propagate char refs right through."
+ self.handle_data('&#%s;' % ref)
+
+ def handle_entityref(self, ref):
+ "Propagate entity refs right through."
+ self.handle_data('&%s;' % ref)
+
+ def handle_decl(self, data):
+ "Propagate DOCTYPEs and the like right through."
+ self.handle_data('<!%s>' % data)
+
+ def parse_declaration(self, i):
+ """Treat a bogus SGML declaration as raw data. Treat a CDATA
+ declaration as regular data."""
+ j = None
+ if self.rawdata[i:i+9] == '<![CDATA[':
+ k = self.rawdata.find(']]>', i)
+ if k == -1:
+ k = len(self.rawdata)
+ self.handle_data(self.rawdata[i+9:k])
+ j = k+3
+ else:
+ try:
+ j = SGMLParser.parse_declaration(self, i)
+ except SGMLParseError:
+ toHandle = self.rawdata[i:]
+ self.handle_data(toHandle)
+ j = i + len(toHandle)
+ return j
+
+class BeautifulSoup(BeautifulStoneSoup):
+
+ """This parser knows the following facts about HTML:
+
+ * Some tags have no closing tag and should be interpreted as being
+ closed as soon as they are encountered.
+
+ * The text inside some tags (ie. 'script') may contain tags which
+ are not really part of the document and which should be parsed
+ as text, not tags. If you want to parse the text as tags, you can
+ always fetch it and parse it explicitly.
+
+ * Tag nesting rules:
+
+ Most tags can't be nested at all. For instance, the occurrence of
+ a <p> tag should implicitly close the previous <p> tag.
+
+ <p>Para1<p>Para2
+ should be transformed into:
+ <p>Para1</p><p>Para2
+
+ Some tags can be nested arbitrarily. For instance, the occurrence
+ of a <blockquote> tag should _not_ implicitly close the previous
+ <blockquote> tag.
+
+ Alice said: <blockquote>Bob said: <blockquote>Blah
+ should NOT be transformed into:
+ Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
+
+ Some tags can be nested, but the nesting is reset by the
+ interposition of other tags. For instance, a <tr> tag should
+ implicitly close the previous <tr> tag within the same <table>,
+ but not close a <tr> tag in another table.
+
+ <table><tr>Blah<tr>Blah
+ should be transformed into:
+ <table><tr>Blah</tr><tr>Blah
+ but,
+ <tr>Blah<table><tr>Blah
+ should NOT be transformed into
+ <tr>Blah<table></tr><tr>Blah
+
+ Differing assumptions about tag nesting rules are a major source
+ of problems with the BeautifulSoup class. If BeautifulSoup is not
+ treating as nestable a tag your page author treats as nestable,
+ try ICantBelieveItsBeautifulSoup before writing your own
+ subclass."""
+
+ SELF_CLOSING_TAGS = buildTagMap(None, ['br' , 'hr', 'input', 'img', 'meta',
+ 'spacer', 'link', 'frame', 'base'])
+
+ QUOTE_TAGS = {'script': None}
+
+ #According to the HTML standard, each of these inline tags can
+ #contain another tag of the same type. Furthermore, it's common
+ #to actually use these tags this way.
+ NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
+ 'center']
+
+ #According to the HTML standard, these block tags can contain
+ #another tag of the same type. Furthermore, it's common
+ #to actually use these tags this way.
+ NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
+
+ #Lists can contain other lists, but there are restrictions.
+ NESTABLE_LIST_TAGS = { 'ol' : [],
+ 'ul' : [],
+ 'li' : ['ul', 'ol'],
+ 'dl' : [],
+ 'dd' : ['dl'],
+ 'dt' : ['dl'] }
+
+ #Tables can contain other tables, but there are restrictions.
+ NESTABLE_TABLE_TAGS = {'table' : [],
+ 'tr' : ['table', 'tbody', 'tfoot', 'thead'],
+ 'td' : ['tr'],
+ 'th' : ['tr'],
+ }
+
+ NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
+
+ #If one of these tags is encountered, all tags up to the next tag of
+ #this type are popped.
+ RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
+ NON_NESTABLE_BLOCK_TAGS,
+ NESTABLE_LIST_TAGS,
+ NESTABLE_TABLE_TAGS)
+
+ NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
+ NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
+
+class ICantBelieveItsBeautifulSoup(BeautifulSoup):
+
+ """The BeautifulSoup class is oriented towards skipping over
+ common HTML errors like unclosed tags. However, sometimes it makes
+ errors of its own. For instance, consider this fragment:
+
+ <b>Foo<b>Bar</b></b>
+
+ This is perfectly valid (if bizarre) HTML. However, the
+ BeautifulSoup class will implicitly close the first b tag when it
+ encounters the second 'b'. It will think the author wrote
+ "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
+ there's no real-world reason to bold something that's already
+ bold. When it encounters '</b></b>' it will close two more 'b'
+ tags, for a grand total of three tags closed instead of two. This
+ can throw off the rest of your document structure. The same is
+ true of a number of other tags, listed below.
+
+ It's much more common for someone to forget to close (eg.) a 'b'
+ tag than to actually use nested 'b' tags, and the BeautifulSoup
+ class handles the common case. This class handles the
+ not-so-common case: where you can't believe someone wrote what
+ they did, but it's valid HTML and BeautifulSoup screwed up by
+ assuming it wouldn't be.
+
+ If this doesn't do what you need, try subclassing this class or
+ BeautifulSoup, and providing your own list of NESTABLE_TAGS."""
+
+ I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
+ ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
+ 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
+ 'big']
+
+ I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
+
+ NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
+ I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
+ I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
+
+class BeautifulSOAP(BeautifulStoneSoup):
+ """This class will push a tag with only a single string child into
+ the tag's parent as an attribute. The attribute's name is the tag
+ name, and the value is the string child. An example should give
+ the flavor of the change:
+
+ <foo><bar>baz</bar></foo>
+ =>
+ <foo bar="baz"><bar>baz</bar></foo>
+
+ You can then access fooTag['bar'] instead of fooTag.barTag.string.
+
+ This is, of course, useful for scraping structures that tend to
+ use subelements instead of attributes, such as SOAP messages. Note
+ that it modifies its input, so don't print the modified version
+ out.
+
+ I'm not sure how many people really want to use this class; let me
+ know if you do. Mainly I like the name."""
+
+ def popTag(self):
+ if len(self.tagStack) > 1:
+ tag = self.tagStack[-1]
+ parent = self.tagStack[-2]
+ parent._getAttrMap()
+ if (isinstance(tag, Tag) and len(tag.contents) == 1 and
+ isinstance(tag.contents[0], NavigableText) and
+ not parent.attrMap.has_key(tag.name)):
+ parent[tag.name] = tag.contents[0]
+ BeautifulStoneSoup.popTag(self)
+
+#Enterprise class names! It has come to our attention that some people
+#think the names of the Beautiful Soup parser classes are too silly
+#and "unprofessional" for use in enterprise screen-scraping. We feel
+#your pain! For such-minded folk, the Beautiful Soup Consortium And
+#All-Night Kosher Bakery recommends renaming this file to
+#"RobustParser.py" (or, in cases of extreme enterprisitude,
+#"RobustParserBeanInterface.class") and using the following
+#enterprise-friendly class aliases:
+class RobustXMLParser(BeautifulStoneSoup):
+ pass
+class RobustHTMLParser(BeautifulSoup):
+ pass
+class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
+ pass
+class SimplifyingSOAPParser(BeautifulSOAP):
+ pass
+
+###
+
+
+#By default, act as an HTML pretty-printer.
+if __name__ == '__main__':
+ import sys
+ soup = BeautifulStoneSoup(sys.stdin.read())
+ print soup.prettify()
Property changes on: Zope3/branches/adamg-mechanize-update/src/mechanize/_beautifulsoup.py
___________________________________________________________________
Name: svn:keywords
+ Date Author Id Revision
Name: svn:eol-style
+ native
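The file above vendors Beautiful Soup 2.1.1 as a private helper module, presumably backing the Robust* parsing factories elsewhere in the package. Purely for illustration of the navigation API it defines (first/fetch/get, importing the underscore module directly only for this sketch):

    from mechanize._beautifulsoup import BeautifulSoup

    soup = BeautifulSoup('<p class="intro">Hello <a href="/next">next page</a></p>')
    link = soup.first('a')              # first matching Tag, or Null if absent
    print link['href']                  # -> /next
    print link.string                   # -> next page
    for para in soup.fetch('p', {'class': 'intro'}):
        print para.renderContents()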
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_clientcookie.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_clientcookie.py 2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_clientcookie.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -1,4 +1,4 @@
-"""HTTP cookie handling for web clients, plus some other stuff.
+"""HTTP cookie handling for web clients.
This module originally developed from my port of Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.
@@ -32,7 +32,7 @@
"""
-import sys, re, urlparse, string, copy, time, struct, urllib, types, logging
+import sys, re, copy, time, struct, urllib, types, logging
try:
import threading
_threading = threading; del threading
@@ -46,7 +46,8 @@
DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
from _headersutil import split_header_words, parse_ns_headers
-from _util import startswith, endswith, isstringlike, getheaders
+from _util import isstringlike
+import _rfc3986
debug = logging.getLogger("mechanize.cookies").debug
@@ -105,17 +106,17 @@
"""
# Note that, if A or B are IP addresses, the only relevant part of the
# definition of the domain-match algorithm is the direct string-compare.
- A = string.lower(A)
- B = string.lower(B)
+ A = A.lower()
+ B = B.lower()
if A == B:
return True
if not is_HDN(A):
return False
- i = string.rfind(A, B)
+ i = A.rfind(B)
has_form_nb = not (i == -1 or i == 0)
return (
has_form_nb and
- startswith(B, ".") and
+ B.startswith(".") and
is_HDN(B[1:])
)
@@ -133,15 +134,15 @@
A and B may be host domain names or IP addresses.
"""
- A = string.lower(A)
- B = string.lower(B)
+ A = A.lower()
+ B = B.lower()
if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
if A == B:
# equal IP addresses
return True
return False
- initial_dot = startswith(B, ".")
- if initial_dot and endswith(A, B):
+ initial_dot = B.startswith(".")
+ if initial_dot and A.endswith(B):
return True
if not initial_dot and A == B:
return True
@@ -156,13 +157,13 @@
"""
url = request.get_full_url()
- host = urlparse.urlparse(url)[1]
- if host == "":
+ host = _rfc3986.urlsplit(url)[1]
+ if host is None:
host = request.get_header("Host", "")
# remove port, if present
host = cut_port_re.sub("", host, 1)
- return string.lower(host)
+ return host.lower()
def eff_request_host(request):
"""Return a tuple (request-host, effective request-host name).
@@ -171,28 +172,23 @@
"""
erhn = req_host = request_host(request)
- if string.find(req_host, ".") == -1 and not IPV4_RE.search(req_host):
+ if req_host.find(".") == -1 and not IPV4_RE.search(req_host):
erhn = req_host + ".local"
return req_host, erhn
def request_path(request):
"""request-URI, as defined by RFC 2965."""
url = request.get_full_url()
- #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(url)
- #req_path = escape_path(string.join(urlparse.urlparse(url)[2:], ""))
- path, parameters, query, frag = urlparse.urlparse(url)[2:]
- if parameters:
- path = "%s;%s" % (path, parameters)
+ path, query, frag = _rfc3986.urlsplit(url)[2:]
path = escape_path(path)
- req_path = urlparse.urlunparse(("", "", path, "", query, frag))
- if not startswith(req_path, "/"):
- # fix bad RFC 2396 absoluteURI
+ req_path = _rfc3986.urlunsplit((None, None, path, query, frag))
+ if not req_path.startswith("/"):
req_path = "/"+req_path
return req_path
def request_port(request):
host = request.get_host()
- i = string.find(host, ':')
+ i = host.find(':')
if i >= 0:
port = host[i+1:]
try:
@@ -209,7 +205,7 @@
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
- return "%%%s" % string.upper(match.group(1))
+ return "%%%s" % match.group(1).upper()
def escape_path(path):
"""Escape any invalid characters in HTTP URL, and uppercase all escapes."""
# There's no knowing what character encoding was used to create URLs
@@ -252,11 +248,11 @@
'.local'
"""
- i = string.find(h, ".")
+ i = h.find(".")
if i >= 0:
#a = h[:i] # this line is only here to show what a is
b = h[i+1:]
- i = string.find(b, ".")
+ i = b.find(".")
if is_HDN(h) and (i >= 0 or b == "local"):
return "."+b
return h
@@ -344,7 +340,7 @@
self.port = port
self.port_specified = port_specified
# normalise case, as per RFC 2965 section 3.3.3
- self.domain = string.lower(domain)
+ self.domain = domain.lower()
self.domain_specified = domain_specified
# Sigh. We need to know whether the domain given in the
# cookie-attribute had an initial dot, in order to follow RFC 2965
@@ -397,7 +393,7 @@
args.append("%s=%s" % (name, repr(attr)))
args.append("rest=%s" % repr(self._rest))
args.append("rfc2109=%s" % repr(self.rfc2109))
- return "Cookie(%s)" % string.join(args, ", ")
+ return "Cookie(%s)" % ", ".join(args)
class CookiePolicy:
@@ -701,7 +697,7 @@
# Try and stop servers setting V0 cookies designed to hack other
# servers that know both V0 and V1 protocols.
if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
- startswith(cookie.name, "$")):
+ cookie.name.startswith("$")):
debug(" illegal name (starts with '$'): '%s'", cookie.name)
return False
return True
@@ -711,7 +707,7 @@
req_path = request_path(request)
if ((cookie.version > 0 or
(cookie.version == 0 and self.strict_ns_set_path)) and
- not startswith(req_path, cookie.path)):
+ not req_path.startswith(cookie.path)):
debug(" path attribute %s is not a prefix of request "
"path %s", cookie.path, req_path)
return False
@@ -728,12 +724,12 @@
domain = cookie.domain
# since domain was specified, we know that:
assert domain.startswith(".")
- if string.count(domain, ".") == 2:
+ if domain.count(".") == 2:
# domain like .foo.bar
- i = string.rfind(domain, ".")
+ i = domain.rfind(".")
tld = domain[i+1:]
sld = domain[1:i]
- if (string.lower(sld) in [
+ if (sld.lower() in [
"co", "ac",
"com", "edu", "org", "net", "gov", "mil", "int",
"aero", "biz", "cat", "coop", "info", "jobs", "mobi",
@@ -757,19 +753,19 @@
if cookie.domain_specified:
req_host, erhn = eff_request_host(request)
domain = cookie.domain
- if startswith(domain, "."):
+ if domain.startswith("."):
undotted_domain = domain[1:]
else:
undotted_domain = domain
- embedded_dots = (string.find(undotted_domain, ".") >= 0)
+ embedded_dots = (undotted_domain.find(".") >= 0)
if not embedded_dots and domain != ".local":
debug(" non-local domain %s contains no embedded dot",
domain)
return False
if cookie.version == 0:
- if (not endswith(erhn, domain) and
- (not startswith(erhn, ".") and
- not endswith("."+erhn, domain))):
+ if (not erhn.endswith(domain) and
+ (not erhn.startswith(".") and
+ not ("."+erhn).endswith(domain))):
debug(" effective request-host %s (even with added "
"initial dot) does not end end with %s",
erhn, domain)
@@ -783,7 +779,7 @@
if (cookie.version > 0 or
(self.strict_ns_domain & self.DomainStrictNoDots)):
host_prefix = req_host[:-len(domain)]
- if (string.find(host_prefix, ".") >= 0 and
+ if (host_prefix.find(".") >= 0 and
not IPV4_RE.search(req_host)):
debug(" host prefix %s for domain %s contains a dot",
host_prefix, domain)
@@ -797,7 +793,7 @@
req_port = "80"
else:
req_port = str(req_port)
- for p in string.split(cookie.port, ","):
+ for p in cookie.port.split(","):
try:
int(p)
except ValueError:
@@ -867,7 +863,7 @@
req_port = request_port(request)
if req_port is None:
req_port = "80"
- for p in string.split(cookie.port, ","):
+ for p in cookie.port.split(","):
if p == req_port:
break
else:
@@ -892,7 +888,7 @@
debug(" effective request-host name %s does not domain-match "
"RFC 2965 cookie domain %s", erhn, domain)
return False
- if cookie.version == 0 and not endswith("."+erhn, domain):
+ if cookie.version == 0 and not ("."+erhn).endswith(domain):
debug(" request-host %s does not match Netscape cookie domain "
"%s", req_host, domain)
return False
@@ -905,12 +901,12 @@
# Munge req_host and erhn to always start with a dot, so as to err on
# the side of letting cookies through.
dotted_req_host, dotted_erhn = eff_request_host(request)
- if not startswith(dotted_req_host, "."):
+ if not dotted_req_host.startswith("."):
dotted_req_host = "."+dotted_req_host
- if not startswith(dotted_erhn, "."):
+ if not dotted_erhn.startswith("."):
dotted_erhn = "."+dotted_erhn
- if not (endswith(dotted_req_host, domain) or
- endswith(dotted_erhn, domain)):
+ if not (dotted_req_host.endswith(domain) or
+ dotted_erhn.endswith(domain)):
#debug(" request domain %s does not match cookie domain %s",
# req_host, domain)
return False
@@ -927,7 +923,7 @@
def path_return_ok(self, path, request):
debug("- checking cookie path=%s", path)
req_path = request_path(request)
- if not startswith(req_path, path):
+ if not req_path.startswith(path):
debug(" %s does not path-match %s", req_path, path)
return False
return True
@@ -1096,10 +1092,10 @@
if version > 0:
if cookie.path_specified:
attrs.append('$Path="%s"' % cookie.path)
- if startswith(cookie.domain, "."):
+ if cookie.domain.startswith("."):
domain = cookie.domain
if (not cookie.domain_initial_dot and
- startswith(domain, ".")):
+ domain.startswith(".")):
domain = domain[1:]
attrs.append('$Domain="%s"' % domain)
if cookie.port is not None:
@@ -1137,8 +1133,7 @@
attrs = self._cookie_attrs(cookies)
if attrs:
if not request.has_header("Cookie"):
- request.add_unredirected_header(
- "Cookie", string.join(attrs, "; "))
+ request.add_unredirected_header("Cookie", "; ".join(attrs))
# if necessary, advertise that we know RFC 2965
if self._policy.rfc2965 and not self._policy.hide_cookie2:
@@ -1188,7 +1183,7 @@
standard = {}
rest = {}
for k, v in cookie_attrs[1:]:
- lc = string.lower(k)
+ lc = k.lower()
# don't lose case distinction for unknown fields
if lc in value_attrs or lc in boolean_attrs:
k = lc
@@ -1205,7 +1200,7 @@
bad_cookie = True
break
# RFC 2965 section 3.3.3
- v = string.lower(v)
+ v = v.lower()
if k == "expires":
if max_age_set:
# Prefer max-age to expires (like Mozilla)
@@ -1272,7 +1267,7 @@
else:
path_specified = False
path = request_path(request)
- i = string.rfind(path, "/")
+ i = path.rfind("/")
if i != -1:
if version == 0:
# Netscape spec parts company from reality here
@@ -1286,11 +1281,11 @@
# but first we have to remember whether it starts with a dot
domain_initial_dot = False
if domain_specified:
- domain_initial_dot = bool(startswith(domain, "."))
+ domain_initial_dot = bool(domain.startswith("."))
if domain is Absent:
req_host, erhn = eff_request_host(request)
domain = erhn
- elif not startswith(domain, "."):
+ elif not domain.startswith("."):
domain = "."+domain
# set default port
@@ -1365,8 +1360,8 @@
"""
# get cookie-attributes for RFC 2965 and Netscape protocols
headers = response.info()
- rfc2965_hdrs = getheaders(headers, "Set-Cookie2")
- ns_hdrs = getheaders(headers, "Set-Cookie")
+ rfc2965_hdrs = headers.getheaders("Set-Cookie2")
+ ns_hdrs = headers.getheaders("Set-Cookie")
rfc2965 = self._policy.rfc2965
netscape = self._policy.netscape
@@ -1550,12 +1545,12 @@
def __repr__(self):
r = []
for cookie in self: r.append(repr(cookie))
- return "<%s[%s]>" % (self.__class__, string.join(r, ", "))
+ return "<%s[%s]>" % (self.__class__, ", ".join(r))
def __str__(self):
r = []
for cookie in self: r.append(str(cookie))
- return "<%s[%s]>" % (self.__class__, string.join(r, ", "))
+ return "<%s[%s]>" % (self.__class__, ", ".join(r))
class LoadError(Exception): pass
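
The _clientcookie.py hunks above are largely a mechanical move from the Python 1.5.2-era string module functions (and the startswith/endswith helpers previously imported from _util) to the equivalent built-in string methods. A minimal sketch of the equivalence, with made-up values purely for illustration:

    import string

    attrs = ["sid=abc", "Path=/"]
    # old spellings, as removed by this change
    assert string.join(attrs, "; ") == "sid=abc; Path=/"
    assert string.lower("Set-Cookie") == "set-cookie"
    # new spellings, as now used throughout the cookie code
    assert "; ".join(attrs) == "sid=abc; Path=/"
    assert "Set-Cookie".lower() == "set-cookie"
    assert ("." + "www.example.com").endswith(".example.com")
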
Added: Zope3/branches/adamg-mechanize-update/src/mechanize/_debug.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_debug.py (rev 0)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_debug.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -0,0 +1,28 @@
+import logging
+
+from urllib2 import BaseHandler
+from _response import response_seek_wrapper
+
+
+class HTTPResponseDebugProcessor(BaseHandler):
+ handler_order = 900 # before redirections, after everything else
+
+ def http_response(self, request, response):
+ if not hasattr(response, "seek"):
+ response = response_seek_wrapper(response)
+ info = logging.getLogger("mechanize.http_responses").info
+ try:
+ info(response.read())
+ finally:
+ response.seek(0)
+ info("*****************************************************")
+ return response
+
+ https_response = http_response
+
+class HTTPRedirectDebugProcessor(BaseHandler):
+ def http_request(self, request):
+ if hasattr(request, "redirect_dict"):
+ info = logging.getLogger("mechanize.http_redirects").info
+ info("redirecting to %s", request.get_full_url())
+ return request
Property changes on: Zope3/branches/adamg-mechanize-update/src/mechanize/_debug.py
___________________________________________________________________
Name: svn:keywords
+ Date Author Id Revision
Name: svn:eol-style
+ native
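
Both processors in the new _debug.py write through the standard logging package, so they produce no output until the named loggers are configured. A minimal sketch of how one might switch them on (the logger names come from the code above; the rest is ordinary stdlib logging, not a mechanize API):

    import logging

    logging.basicConfig()  # attach a default stream handler to the root logger
    logging.getLogger("mechanize.http_responses").setLevel(logging.INFO)
    logging.getLogger("mechanize.http_redirects").setLevel(logging.INFO)
    # HTTPResponseDebugProcessor will now log each response body, and
    # HTTPRedirectDebugProcessor each redirect target, at INFO level
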
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_gzip.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_gzip.py 2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_gzip.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -1,6 +1,6 @@
import urllib2
from cStringIO import StringIO
-import _util
+import _response
# GzipConsumer was taken from Fredrik Lundh's effbot.org-0.1-20041009 library
class GzipConsumer:
@@ -65,7 +65,7 @@
def __init__(self): self.data = []
def feed(self, data): self.data.append(data)
-class stupid_gzip_wrapper(_util.closeable_response):
+class stupid_gzip_wrapper(_response.closeable_response):
def __init__(self, response):
self._response = response
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_headersutil.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_headersutil.py 2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_headersutil.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -9,12 +9,13 @@
"""
-import os, re, string, urlparse
+import os, re
from types import StringType
from types import UnicodeType
STRING_TYPES = StringType, UnicodeType
-from _util import startswith, endswith, http2time
+from _util import http2time
+import _rfc3986
def is_html(ct_headers, url, allow_xhtml=False):
"""
@@ -24,7 +25,7 @@
"""
if not ct_headers:
# guess
- ext = os.path.splitext(urlparse.urlparse(url)[2])[1]
+ ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1]
html_exts = [".htm", ".html"]
if allow_xhtml:
html_exts += [".xhtml"]
@@ -113,14 +114,14 @@
if m: # unquoted value
text = unmatched(m)
value = m.group(1)
- value = string.rstrip(value)
+ value = value.rstrip()
else:
# no value, a lone token
value = None
pairs.append((name, value))
- elif startswith(string.lstrip(text), ","):
+ elif text.lstrip().startswith(","):
# concatenated headers, as per RFC 2616 section 4.2
- text = string.lstrip(text)[1:]
+ text = text.lstrip()[1:]
if pairs: result.append(pairs)
pairs = []
else:
@@ -159,8 +160,8 @@
else:
k = "%s=%s" % (k, v)
attr.append(k)
- if attr: headers.append(string.join(attr, "; "))
- return string.join(headers, ", ")
+ if attr: headers.append("; ".join(attr))
+ return ", ".join(headers)
def parse_ns_headers(ns_headers):
"""Ad-hoc parser for Netscape protocol cookie-attributes.
@@ -188,15 +189,15 @@
params = re.split(r";\s*", ns_header)
for ii in range(len(params)):
param = params[ii]
- param = string.rstrip(param)
+ param = param.rstrip()
if param == "": continue
if "=" not in param:
k, v = param, None
else:
k, v = re.split(r"\s*=\s*", param, 1)
- k = string.lstrip(k)
+ k = k.lstrip()
if ii != 0:
- lc = string.lower(k)
+ lc = k.lower()
if lc in known_attrs:
k = lc
if k == "version":
@@ -204,8 +205,8 @@
version_set = True
if k == "expires":
# convert expires date to seconds since epoch
- if startswith(v, '"'): v = v[1:]
- if endswith(v, '"'): v = v[:-1]
+ if v.startswith('"'): v = v[1:]
+ if v.endswith('"'): v = v[:-1]
v = http2time(v) # None if invalid
pairs.append((k, v))
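
The is_html() change above swaps urlparse.urlparse() for the bundled _rfc3986.urlsplit() when guessing from the URL extension; the path is still element 2 of the split result. A rough stand-alone mirror of that fallback, using the stdlib parser and made-up URLs, to show what the guess does when no Content-Type headers are available:

    import os.path, urlparse

    def looks_like_html(url, allow_xhtml=False):
        # mirror of the no-Content-Type fallback in is_html() above
        ext = os.path.splitext(urlparse.urlsplit(url)[2])[1]
        html_exts = [".htm", ".html"]
        if allow_xhtml:
            html_exts += [".xhtml"]
        return ext in html_exts

    assert looks_like_html("http://example.com/index.html")
    assert not looks_like_html("http://example.com/logo.png")
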
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_html.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_html.py 2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_html.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -8,97 +8,42 @@
"""
-import re, copy, urllib, htmlentitydefs
-from urlparse import urljoin
+import re, copy, htmlentitydefs
+import sgmllib, HTMLParser, ClientForm
import _request
from _headersutil import split_header_words, is_html as _is_html
+import _rfc3986
-## # XXXX miserable hack
-## def urljoin(base, url):
-## if url.startswith("?"):
-## return base+url
-## else:
-## return urlparse.urljoin(base, url)
+DEFAULT_ENCODING = "latin-1"
-## def chr_range(a, b):
-## return "".join(map(chr, range(ord(a), ord(b)+1)))
-## RESERVED_URL_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-## "abcdefghijklmnopqrstuvwxyz"
-## "-_.~")
-## UNRESERVED_URL_CHARS = "!*'();:@&=+$,/?%#[]"
-# we want (RESERVED_URL_CHARS+UNRESERVED_URL_CHARS), minus those
-# 'safe'-by-default characters that urllib.urlquote never quotes
-URLQUOTE_SAFE_URL_CHARS = "!*'();:@&=+$,/?%#[]~"
+# the base class is purely for backwards compatibility
+class ParseError(ClientForm.ParseError): pass
-DEFAULT_ENCODING = "latin-1"
class CachingGeneratorFunction(object):
- """Caching wrapper around a no-arguments iterable.
+ """Caching wrapper around a no-arguments iterable."""
- >>> i = [1]
- >>> func = CachingGeneratorFunction(i)
- >>> list(func())
- [1]
- >>> list(func())
- [1]
-
- >>> i = [1, 2, 3]
- >>> func = CachingGeneratorFunction(i)
- >>> list(func())
- [1, 2, 3]
-
- >>> i = func()
- >>> i.next()
- 1
- >>> i.next()
- 2
- >>> i.next()
- 3
-
- >>> i = func()
- >>> j = func()
- >>> i.next()
- 1
- >>> j.next()
- 1
- >>> i.next()
- 2
- >>> j.next()
- 2
- >>> j.next()
- 3
- >>> i.next()
- 3
- >>> i.next()
- Traceback (most recent call last):
- ...
- StopIteration
- >>> j.next()
- Traceback (most recent call last):
- ...
- StopIteration
- """
def __init__(self, iterable):
- def make_gen():
- for item in iterable:
- yield item
-
self._cache = []
- self._generator = make_gen()
+ # wrap iterable to make it non-restartable (otherwise, repeated
+ # __call__ would give incorrect results)
+ self._iterator = iter(iterable)
def __call__(self):
cache = self._cache
-
for item in cache:
yield item
- for item in self._generator:
+ for item in self._iterator:
cache.append(item)
yield item
-def encoding_finder(default_encoding):
- def encoding(response):
+
+class EncodingFinder:
+ def __init__(self, default_encoding):
+ self._default_encoding = default_encoding
+ def encoding(self, response):
# HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
# headers may be in the response. HTTP-EQUIV headers come last,
# so try in order from first to last.
@@ -106,17 +51,18 @@
for k, v in split_header_words([ct])[0]:
if k == "charset":
return v
- return default_encoding
- return encoding
+ return self._default_encoding
-def make_is_html(allow_xhtml):
- def is_html(response, encoding):
+class ResponseTypeFinder:
+ def __init__(self, allow_xhtml):
+ self._allow_xhtml = allow_xhtml
+ def is_html(self, response, encoding):
ct_hdrs = response.info().getheaders("content-type")
url = response.geturl()
# XXX encoding
- return _is_html(ct_hdrs, url, allow_xhtml)
- return is_html
+ return _is_html(ct_hdrs, url, self._allow_xhtml)
+
# idea for this argument-processing trick is from Peter Otten
class Args:
def __init__(self, args_map):
@@ -140,7 +86,7 @@
def __init__(self, base_url, url, text, tag, attrs):
assert None not in [url, tag, attrs]
self.base_url = base_url
- self.absolute_url = urljoin(base_url, url)
+ self.absolute_url = _rfc3986.urljoin(base_url, url)
self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
def __cmp__(self, other):
try:
@@ -155,19 +101,6 @@
self.base_url, self.url, self.text, self.tag, self.attrs)
-def clean_url(url, encoding):
- # percent-encode illegal URL characters
- # Trying to come up with test cases for this gave me a headache, revisit
- # when do switch to unicode.
- # Somebody else's comments (lost the attribution):
-## - IE will return you the url in the encoding you send it
-## - Mozilla/Firefox will send you latin-1 if there's no non latin-1
-## characters in your link. It will send you utf-8 however if there are...
- if type(url) == type(""):
- url = url.decode(encoding, "replace")
- url = url.strip()
- return urllib.quote(url.encode(encoding), URLQUOTE_SAFE_URL_CHARS)
-
class LinksFactory:
def __init__(self,
@@ -203,40 +136,49 @@
base_url = self._base_url
p = self.link_parser_class(response, encoding=encoding)
- for token in p.tags(*(self.urltags.keys()+["base"])):
- if token.data == "base":
- base_url = dict(token.attrs).get("href")
- continue
- if token.type == "endtag":
- continue
- attrs = dict(token.attrs)
- tag = token.data
- name = attrs.get("name")
- text = None
- # XXX use attr_encoding for ref'd doc if that doc does not provide
- # one by other means
- #attr_encoding = attrs.get("charset")
- url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
- if not url:
- # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
- # For our purposes a link is something with a URL, so ignore
- # this.
- continue
+ try:
+ for token in p.tags(*(self.urltags.keys()+["base"])):
+ if token.type == "endtag":
+ continue
+ if token.data == "base":
+ base_href = dict(token.attrs).get("href")
+ if base_href is not None:
+ base_url = base_href
+ continue
+ attrs = dict(token.attrs)
+ tag = token.data
+ name = attrs.get("name")
+ text = None
+ # XXX use attr_encoding for ref'd doc if that doc does not
+ # provide one by other means
+ #attr_encoding = attrs.get("charset")
+ url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
+ if not url:
+ # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
+ # For our purposes a link is something with a URL, so
+ # ignore this.
+ continue
- url = clean_url(url, encoding)
- if tag == "a":
- if token.type != "startendtag":
- # hmm, this'd break if end tag is missing
- text = p.get_compressed_text(("endtag", tag))
- # but this doesn't work for eg. <a href="blah"><b>Andy</b></a>
- #text = p.get_compressed_text()
+ url = _rfc3986.clean_url(url, encoding)
+ if tag == "a":
+ if token.type != "startendtag":
+ # hmm, this'd break if end tag is missing
+ text = p.get_compressed_text(("endtag", tag))
+ # but this doesn't work for eg.
+ # <a href="blah"><b>Andy</b></a>
+ #text = p.get_compressed_text()
- yield Link(base_url, url, text, tag, token.attrs)
+ yield Link(base_url, url, text, tag, token.attrs)
+ except sgmllib.SGMLParseError, exc:
+ raise ParseError(exc)
class FormsFactory:
"""Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
+ After calling .forms(), the .global_form attribute is a form object
+ containing all controls that are not descendants of any FORM element.
+
For constructor argument docs, see ClientForm.ParseResponse
argument docs.
@@ -259,22 +201,31 @@
self.backwards_compat = backwards_compat
self._response = None
self.encoding = None
+ self.global_form = None
def set_response(self, response, encoding):
self._response = response
self.encoding = encoding
+ self.global_form = None
def forms(self):
import ClientForm
encoding = self.encoding
- return ClientForm.ParseResponse(
- self._response,
- select_default=self.select_default,
- form_parser_class=self.form_parser_class,
- request_class=self.request_class,
- backwards_compat=self.backwards_compat,
- encoding=encoding,
- )
+ try:
+ forms = ClientForm.ParseResponseEx(
+ self._response,
+ select_default=self.select_default,
+ form_parser_class=self.form_parser_class,
+ request_class=self.request_class,
+ encoding=encoding,
+ _urljoin=_rfc3986.urljoin,
+ _urlparse=_rfc3986.urlsplit,
+ _urlunparse=_rfc3986.urlunsplit,
+ )
+ except ClientForm.ParseError, exc:
+ raise ParseError(exc)
+ self.global_form = forms[0]
+ return forms[1:]
class TitleFactory:
def __init__(self):
@@ -289,11 +240,14 @@
p = _pullparser.TolerantPullParser(
self._response, encoding=self._encoding)
try:
- p.get_tag("title")
- except _pullparser.NoMoreTokensError:
- return None
- else:
- return p.get_text()
+ try:
+ p.get_tag("title")
+ except _pullparser.NoMoreTokensError:
+ return None
+ else:
+ return p.get_text()
+ except sgmllib.SGMLParseError, exc:
+ raise ParseError(exc)
def unescape(data, entities, encoding):
@@ -334,42 +288,44 @@
return repl
-try:
- import BeautifulSoup
-except ImportError:
- pass
-else:
- import sgmllib
- # monkeypatch to fix http://www.python.org/sf/803422 :-(
- sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
- class MechanizeBs(BeautifulSoup.BeautifulSoup):
- _entitydefs = htmlentitydefs.name2codepoint
- # don't want the magic Microsoft-char workaround
- PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
- lambda(x):x.group(1) + ' />'),
- (re.compile('<!\s+([^<>]*)>'),
- lambda(x):'<!' + x.group(1) + '>')
- ]
+# bizarre import gymnastics for bundled BeautifulSoup
+import _beautifulsoup
+import ClientForm
+RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
+ _beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
+ )
+# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
+import sgmllib
+sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
- def __init__(self, encoding, text=None, avoidParserProblems=True,
- initialTextIsEverything=True):
- self._encoding = encoding
- BeautifulSoup.BeautifulSoup.__init__(
- self, text, avoidParserProblems, initialTextIsEverything)
+class MechanizeBs(_beautifulsoup.BeautifulSoup):
+ _entitydefs = htmlentitydefs.name2codepoint
+ # don't want the magic Microsoft-char workaround
+ PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
+ lambda(x):x.group(1) + ' />'),
+ (re.compile('<!\s+([^<>]*)>'),
+ lambda(x):'<!' + x.group(1) + '>')
+ ]
- def handle_charref(self, ref):
- t = unescape("&#%s;"%ref, self._entitydefs, self._encoding)
- self.handle_data(t)
- def handle_entityref(self, ref):
- t = unescape("&%s;"%ref, self._entitydefs, self._encoding)
- self.handle_data(t)
- def unescape_attrs(self, attrs):
- escaped_attrs = []
- for key, val in attrs:
- val = unescape(val, self._entitydefs, self._encoding)
- escaped_attrs.append((key, val))
- return escaped_attrs
+ def __init__(self, encoding, text=None, avoidParserProblems=True,
+ initialTextIsEverything=True):
+ self._encoding = encoding
+ _beautifulsoup.BeautifulSoup.__init__(
+ self, text, avoidParserProblems, initialTextIsEverything)
+ def handle_charref(self, ref):
+ t = unescape("&#%s;"%ref, self._entitydefs, self._encoding)
+ self.handle_data(t)
+ def handle_entityref(self, ref):
+ t = unescape("&%s;"%ref, self._entitydefs, self._encoding)
+ self.handle_data(t)
+ def unescape_attrs(self, attrs):
+ escaped_attrs = []
+ for key, val in attrs:
+ val = unescape(val, self._entitydefs, self._encoding)
+ escaped_attrs.append((key, val))
+ return escaped_attrs
+
class RobustLinksFactory:
compress_re = re.compile(r"\s+")
@@ -379,7 +335,7 @@
link_class=Link,
urltags=None,
):
- import BeautifulSoup
+ import _beautifulsoup
if link_parser_class is None:
link_parser_class = MechanizeBs
self.link_parser_class = link_parser_class
@@ -402,27 +358,29 @@
self._encoding = encoding
def links(self):
- import BeautifulSoup
+ import _beautifulsoup
bs = self._bs
base_url = self._base_url
encoding = self._encoding
gen = bs.recursiveChildGenerator()
for ch in bs.recursiveChildGenerator():
- if (isinstance(ch, BeautifulSoup.Tag) and
+ if (isinstance(ch, _beautifulsoup.Tag) and
ch.name in self.urltags.keys()+["base"]):
link = ch
attrs = bs.unescape_attrs(link.attrs)
attrs_dict = dict(attrs)
if link.name == "base":
- base_url = attrs_dict.get("href")
+ base_href = attrs_dict.get("href")
+ if base_href is not None:
+ base_url = base_href
continue
url_attr = self.urltags[link.name]
url = attrs_dict.get(url_attr)
if not url:
continue
- url = clean_url(url, encoding)
+ url = _rfc3986.clean_url(url, encoding)
text = link.firstText(lambda t: True)
- if text is BeautifulSoup.Null:
+ if text is _beautifulsoup.Null:
# follow _pullparser's weird behaviour rigidly
if link.name == "a":
text = ""
@@ -438,7 +396,7 @@
import ClientForm
args = form_parser_args(*args, **kwds)
if args.form_parser_class is None:
- args.form_parser_class = ClientForm.RobustFormParser
+ args.form_parser_class = RobustFormParser
FormsFactory.__init__(self, **args.dictionary)
def set_response(self, response, encoding):
@@ -454,10 +412,10 @@
self._bs = soup
self._encoding = encoding
- def title(soup):
- import BeautifulSoup
+ def title(self):
+ import _beautifulsoup
title = self._bs.first("title")
- if title == BeautifulSoup.Null:
+ if title == _beautifulsoup.Null:
return None
else:
return title.firstText(lambda t: True)
@@ -477,18 +435,25 @@
Public attributes:
+ Note that accessing these attributes may raise ParseError.
+
encoding: string specifying the encoding of response if it contains a text
document (this value is left unspecified for documents that do not have
an encoding, e.g. an image file)
is_html: true if response contains an HTML document (XHTML may be
regarded as HTML too)
title: page title, or None if no title or not HTML
+ global_form: form object containing all controls that are not descendants
+ of any FORM element, or None if the forms_factory does not support
+ supplying a global form
"""
+ LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]
+
def __init__(self, forms_factory, links_factory, title_factory,
- get_encoding=encoding_finder(DEFAULT_ENCODING),
- is_html_p=make_is_html(allow_xhtml=False),
+ encoding_finder=EncodingFinder(DEFAULT_ENCODING),
+ response_type_finder=ResponseTypeFinder(allow_xhtml=False),
):
"""
@@ -504,8 +469,8 @@
self._forms_factory = forms_factory
self._links_factory = links_factory
self._title_factory = title_factory
- self._get_encoding = get_encoding
- self._is_html_p = is_html_p
+ self._encoding_finder = encoding_finder
+ self._response_type_finder = response_type_finder
self.set_response(None)
@@ -521,51 +486,71 @@
def set_response(self, response):
"""Set response.
- The response must implement the same interface as objects returned by
- urllib2.urlopen().
+ The response must either be None or implement the same interface as
+ objects returned by urllib2.urlopen().
"""
self._response = response
self._forms_genf = self._links_genf = None
self._get_title = None
- for name in ["encoding", "is_html", "title"]:
+ for name in self.LAZY_ATTRS:
try:
delattr(self, name)
except AttributeError:
pass
def __getattr__(self, name):
- if name not in ["encoding", "is_html", "title"]:
+ if name not in self.LAZY_ATTRS:
return getattr(self.__class__, name)
- try:
- if name == "encoding":
- self.encoding = self._get_encoding(self._response)
- return self.encoding
- elif name == "is_html":
- self.is_html = self._is_html_p(self._response, self.encoding)
- return self.is_html
- elif name == "title":
- if self.is_html:
- self.title = self._title_factory.title()
- else:
- self.title = None
- return self.title
- finally:
- self._response.seek(0)
+ if name == "encoding":
+ self.encoding = self._encoding_finder.encoding(
+ copy.copy(self._response))
+ return self.encoding
+ elif name == "is_html":
+ self.is_html = self._response_type_finder.is_html(
+ copy.copy(self._response), self.encoding)
+ return self.is_html
+ elif name == "title":
+ if self.is_html:
+ self.title = self._title_factory.title()
+ else:
+ self.title = None
+ return self.title
+ elif name == "global_form":
+ self.forms()
+ return self.global_form
def forms(self):
- """Return iterable over ClientForm.HTMLForm-like objects."""
+ """Return iterable over ClientForm.HTMLForm-like objects.
+
+ Raises mechanize.ParseError on failure.
+ """
+ # this implementation sets .global_form as a side-effect, for benefit
+ # of __getattr__ impl
if self._forms_genf is None:
- self._forms_genf = CachingGeneratorFunction(
- self._forms_factory.forms())
+ try:
+ self._forms_genf = CachingGeneratorFunction(
+ self._forms_factory.forms())
+ except: # XXXX define exception!
+ self.set_response(self._response)
+ raise
+ self.global_form = getattr(
+ self._forms_factory, "global_form", None)
return self._forms_genf()
def links(self):
- """Return iterable over mechanize.Link-like objects."""
+ """Return iterable over mechanize.Link-like objects.
+
+ Raises mechanize.ParseError on failure.
+ """
if self._links_genf is None:
- self._links_genf = CachingGeneratorFunction(
- self._links_factory.links())
+ try:
+ self._links_genf = CachingGeneratorFunction(
+ self._links_factory.links())
+ except: # XXXX define exception!
+ self.set_response(self._response)
+ raise
return self._links_genf()
class DefaultFactory(Factory):
@@ -576,7 +561,8 @@
forms_factory=FormsFactory(),
links_factory=LinksFactory(),
title_factory=TitleFactory(),
- is_html_p=make_is_html(allow_xhtml=i_want_broken_xhtml_support),
+ response_type_finder=ResponseTypeFinder(
+ allow_xhtml=i_want_broken_xhtml_support),
)
def set_response(self, response):
@@ -585,7 +571,7 @@
self._forms_factory.set_response(
copy.copy(response), self.encoding)
self._links_factory.set_response(
- copy.copy(response), self._response.geturl(), self.encoding)
+ copy.copy(response), response.geturl(), self.encoding)
self._title_factory.set_response(
copy.copy(response), self.encoding)
@@ -601,19 +587,21 @@
forms_factory=RobustFormsFactory(),
links_factory=RobustLinksFactory(),
title_factory=RobustTitleFactory(),
- is_html_p=make_is_html(allow_xhtml=i_want_broken_xhtml_support),
+ response_type_finder=ResponseTypeFinder(
+ allow_xhtml=i_want_broken_xhtml_support),
)
if soup_class is None:
soup_class = MechanizeBs
self._soup_class = soup_class
def set_response(self, response):
- import BeautifulSoup
+ import _beautifulsoup
Factory.set_response(self, response)
if response is not None:
data = response.read()
soup = self._soup_class(self.encoding, data)
- self._forms_factory.set_response(response, self.encoding)
+ self._forms_factory.set_response(
+ copy.copy(response), self.encoding)
self._links_factory.set_soup(
soup, response.geturl(), self.encoding)
self._title_factory.set_soup(soup, self.encoding)
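
The doctest that used to document CachingGeneratorFunction is gone from _html.py, but the behaviour it described is unchanged by the rewrite above: the wrapper consumes the underlying iterable once and replays cached items on every later call, which is what lets Factory.forms() and Factory.links() hand out fresh iterators repeatedly without re-parsing. A rough usage sketch with illustrative values:

    from mechanize._html import CachingGeneratorFunction

    func = CachingGeneratorFunction([1, 2, 3])
    assert list(func()) == [1, 2, 3]  # first call drains the iterable into the cache
    assert list(func()) == [1, 2, 3]  # later calls replay the cache
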
Added: Zope3/branches/adamg-mechanize-update/src/mechanize/_http.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_http.py (rev 0)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_http.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -0,0 +1,729 @@
+"""HTTP related handlers.
+
+Note that some other HTTP handlers live in more specific modules: _auth.py,
+_gzip.py, etc.
+
+
+Copyright 2002-2006 John J Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file
+COPYING.txt included with the distribution).
+
+"""
+
+import copy, time, tempfile, htmlentitydefs, re, logging, socket, \
+ urllib2, urllib, httplib, sgmllib
+from urllib2 import URLError, HTTPError, BaseHandler
+from cStringIO import StringIO
+
+from _request import Request
+from _util import isstringlike
+from _response import closeable_response, response_seek_wrapper
+from _html import unescape, unescape_charref
+from _headersutil import is_html
+from _clientcookie import CookieJar, request_host
+import _rfc3986
+
+debug = logging.getLogger("mechanize").debug
+
+# monkeypatch urllib2.HTTPError to show URL
+## def urllib2_str(self):
+## return 'HTTP Error %s: %s (%s)' % (
+## self.code, self.msg, self.geturl())
+## urllib2.HTTPError.__str__ = urllib2_str
+
+
+CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes
+DEFAULT_ENCODING = 'latin-1'
+
+
+# This adds "refresh" to the list of redirectables and provides a redirection
+# algorithm that doesn't go into a loop in the presence of cookies
+# (Python 2.4 has this new algorithm, 2.3 doesn't).
+class HTTPRedirectHandler(BaseHandler):
+ # maximum number of redirections to any single URL
+ # this is needed because of the state that cookies introduce
+ max_repeats = 4
+ # maximum total number of redirections (regardless of URL) before
+ # assuming we're in a loop
+ max_redirections = 10
+
+ # Implementation notes:
+
+ # To avoid the server sending us into an infinite loop, the request
+ # object needs to track what URLs we have already seen. Do this by
+ # adding a handler-specific attribute to the Request object. The value
+ # of the dict is used to count the number of times the same URL has
+ # been visited. This is needed because visiting the same URL twice
+ # does not necessarily imply a loop, thanks to state introduced by
+ # cookies.
+
+ # Always unhandled redirection codes:
+ # 300 Multiple Choices: should not handle this here.
+ # 304 Not Modified: no need to handle here: only of interest to caches
+ # that do conditional GETs
+ # 305 Use Proxy: probably not worth dealing with here
+ # 306 Unused: what was this for in the previous versions of protocol??
+
+ def redirect_request(self, newurl, req, fp, code, msg, headers):
+ """Return a Request or None in response to a redirect.
+
+ This is called by the http_error_30x methods when a redirection
+ response is received. If a redirection should take place, return a
+ new Request to allow http_error_30x to perform the redirect;
+ otherwise, return None to indicate that an HTTPError should be
+ raised.
+
+ """
+ if code in (301, 302, 303, "refresh") or \
+ (code == 307 and not req.has_data()):
+ # Strictly (according to RFC 2616), 301 or 302 in response to
+ # a POST MUST NOT cause a redirection without confirmation
+ # from the user (of urllib2, in this case). In practice,
+ # essentially all clients do redirect in this case, so we do
+ # the same.
+ # XXX really refresh redirections should be visiting; tricky to
+ # fix, so this will wait until post-stable release
+ new = Request(newurl,
+ headers=req.headers,
+ origin_req_host=req.get_origin_req_host(),
+ unverifiable=True,
+ visit=False,
+ )
+ new._origin_req = getattr(req, "_origin_req", req)
+ return new
+ else:
+ raise HTTPError(req.get_full_url(), code, msg, headers, fp)
+
+ def http_error_302(self, req, fp, code, msg, headers):
+ # Some servers (incorrectly) return multiple Location headers
+ # (so probably same goes for URI). Use first header.
+ if headers.has_key('location'):
+ newurl = headers.getheaders('location')[0]
+ elif headers.has_key('uri'):
+ newurl = headers.getheaders('uri')[0]
+ else:
+ return
+ newurl = _rfc3986.clean_url(newurl, "latin-1")
+ newurl = _rfc3986.urljoin(req.get_full_url(), newurl)
+
+ # XXX Probably want to forget about the state of the current
+ # request, although that might interact poorly with other
+ # handlers that also use handler-specific request attributes
+ new = self.redirect_request(newurl, req, fp, code, msg, headers)
+ if new is None:
+ return
+
+ # loop detection
+ # .redirect_dict has a key url if url was previously visited.
+ if hasattr(req, 'redirect_dict'):
+ visited = new.redirect_dict = req.redirect_dict
+ if (visited.get(newurl, 0) >= self.max_repeats or
+ len(visited) >= self.max_redirections):
+ raise HTTPError(req.get_full_url(), code,
+ self.inf_msg + msg, headers, fp)
+ else:
+ visited = new.redirect_dict = req.redirect_dict = {}
+ visited[newurl] = visited.get(newurl, 0) + 1
+
+ # Don't close the fp until we are sure that we won't use it
+ # with HTTPError.
+ fp.read()
+ fp.close()
+
+ return self.parent.open(new)
+
+ http_error_301 = http_error_303 = http_error_307 = http_error_302
+ http_error_refresh = http_error_302
+
+ inf_msg = "The HTTP server returned a redirect error that would " \
+ "lead to an infinite loop.\n" \
+ "The last 30x error message was:\n"
+
+
+# XXX would self.reset() work, instead of raising this exception?
+class EndOfHeadError(Exception): pass
+class AbstractHeadParser:
+ # only these elements are allowed in or before HEAD of document
+ head_elems = ("html", "head",
+ "title", "base",
+ "script", "style", "meta", "link", "object")
+ _entitydefs = htmlentitydefs.name2codepoint
+ _encoding = DEFAULT_ENCODING
+
+ def __init__(self):
+ self.http_equiv = []
+
+ def start_meta(self, attrs):
+ http_equiv = content = None
+ for key, value in attrs:
+ if key == "http-equiv":
+ http_equiv = self.unescape_attr_if_required(value)
+ elif key == "content":
+ content = self.unescape_attr_if_required(value)
+ if http_equiv is not None and content is not None:
+ self.http_equiv.append((http_equiv, content))
+
+ def end_head(self):
+ raise EndOfHeadError()
+
+ def handle_entityref(self, name):
+ #debug("%s", name)
+ self.handle_data(unescape(
+ '&%s;' % name, self._entitydefs, self._encoding))
+
+ def handle_charref(self, name):
+ #debug("%s", name)
+ self.handle_data(unescape_charref(name, self._encoding))
+
+ def unescape_attr(self, name):
+ #debug("%s", name)
+ return unescape(name, self._entitydefs, self._encoding)
+
+ def unescape_attrs(self, attrs):
+ #debug("%s", attrs)
+ escaped_attrs = {}
+ for key, val in attrs.items():
+ escaped_attrs[key] = self.unescape_attr(val)
+ return escaped_attrs
+
+ def unknown_entityref(self, ref):
+ self.handle_data("&%s;" % ref)
+
+ def unknown_charref(self, ref):
+ self.handle_data("&#%s;" % ref)
+
+
+try:
+ import HTMLParser
+except ImportError:
+ pass
+else:
+ class XHTMLCompatibleHeadParser(AbstractHeadParser,
+ HTMLParser.HTMLParser):
+ def __init__(self):
+ HTMLParser.HTMLParser.__init__(self)
+ AbstractHeadParser.__init__(self)
+
+ def handle_starttag(self, tag, attrs):
+ if tag not in self.head_elems:
+ raise EndOfHeadError()
+ try:
+ method = getattr(self, 'start_' + tag)
+ except AttributeError:
+ try:
+ method = getattr(self, 'do_' + tag)
+ except AttributeError:
+ pass # unknown tag
+ else:
+ method(attrs)
+ else:
+ method(attrs)
+
+ def handle_endtag(self, tag):
+ if tag not in self.head_elems:
+ raise EndOfHeadError()
+ try:
+ method = getattr(self, 'end_' + tag)
+ except AttributeError:
+ pass # unknown tag
+ else:
+ method()
+
+ def unescape(self, name):
+ # Use the entitydefs passed into constructor, not
+ # HTMLParser.HTMLParser's entitydefs.
+ return self.unescape_attr(name)
+
+ def unescape_attr_if_required(self, name):
+ return name # HTMLParser.HTMLParser already did it
+
+class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):
+
+ def _not_called(self):
+ assert False
+
+ def __init__(self):
+ sgmllib.SGMLParser.__init__(self)
+ AbstractHeadParser.__init__(self)
+
+ def handle_starttag(self, tag, method, attrs):
+ if tag not in self.head_elems:
+ raise EndOfHeadError()
+ if tag == "meta":
+ method(attrs)
+
+ def unknown_starttag(self, tag, attrs):
+ self.handle_starttag(tag, self._not_called, attrs)
+
+ def handle_endtag(self, tag, method):
+ if tag in self.head_elems:
+ method()
+ else:
+ raise EndOfHeadError()
+
+ def unescape_attr_if_required(self, name):
+ return self.unescape_attr(name)
+
+def parse_head(fileobj, parser):
+ """Return a list of key, value pairs."""
+ while 1:
+ data = fileobj.read(CHUNK)
+ try:
+ parser.feed(data)
+ except EndOfHeadError:
+ break
+ if len(data) != CHUNK:
+ # this should only happen if there is no HTML body, or if
+ # CHUNK is big
+ break
+ return parser.http_equiv
+
+class HTTPEquivProcessor(BaseHandler):
+ """Append META HTTP-EQUIV headers to regular HTTP headers."""
+
+ handler_order = 300 # before handlers that look at HTTP headers
+
+ def __init__(self, head_parser_class=HeadParser,
+ i_want_broken_xhtml_support=False,
+ ):
+ self.head_parser_class = head_parser_class
+ self._allow_xhtml = i_want_broken_xhtml_support
+
+ def http_response(self, request, response):
+ if not hasattr(response, "seek"):
+ response = response_seek_wrapper(response)
+ http_message = response.info()
+ url = response.geturl()
+ ct_hdrs = http_message.getheaders("content-type")
+ if is_html(ct_hdrs, url, self._allow_xhtml):
+ try:
+ try:
+ html_headers = parse_head(response, self.head_parser_class())
+ finally:
+ response.seek(0)
+ except (HTMLParser.HTMLParseError,
+ sgmllib.SGMLParseError):
+ pass
+ else:
+ for hdr, val in html_headers:
+ # add a header
+ http_message.dict[hdr.lower()] = val
+ text = hdr + ": " + val
+ for line in text.split("\n"):
+ http_message.headers.append(line + "\n")
+ return response
+
+ https_response = http_response
+
+class HTTPCookieProcessor(BaseHandler):
+ """Handle HTTP cookies.
+
+ Public attributes:
+
+ cookiejar: CookieJar instance
+
+ """
+ def __init__(self, cookiejar=None):
+ if cookiejar is None:
+ cookiejar = CookieJar()
+ self.cookiejar = cookiejar
+
+ def http_request(self, request):
+ self.cookiejar.add_cookie_header(request)
+ return request
+
+ def http_response(self, request, response):
+ self.cookiejar.extract_cookies(response, request)
+ return response
+
+ https_request = http_request
+ https_response = http_response
+
+try:
+ import robotparser
+except ImportError:
+ pass
+else:
+ class MechanizeRobotFileParser(robotparser.RobotFileParser):
+
+ def __init__(self, url='', opener=None):
+ import _opener
+ robotparser.RobotFileParser.__init__(self, url)
+ self._opener = opener
+
+ def set_opener(self, opener=None):
+ if opener is None:
+ opener = _opener.OpenerDirector()
+ self._opener = opener
+
+ def read(self):
+ """Reads the robots.txt URL and feeds it to the parser."""
+ if self._opener is None:
+ self.set_opener()
+ req = Request(self.url, unverifiable=True, visit=False)
+ try:
+ f = self._opener.open(req)
+ except HTTPError, f:
+ pass
+ except (IOError, socket.error, OSError), exc:
+ robotparser._debug("ignoring error opening %r: %s" %
+ (self.url, exc))
+ return
+ lines = []
+ line = f.readline()
+ while line:
+ lines.append(line.strip())
+ line = f.readline()
+ status = f.code
+ if status == 401 or status == 403:
+ self.disallow_all = True
+ robotparser._debug("disallow all")
+ elif status >= 400:
+ self.allow_all = True
+ robotparser._debug("allow all")
+ elif status == 200 and lines:
+ robotparser._debug("parse lines")
+ self.parse(lines)
+
+ class RobotExclusionError(urllib2.HTTPError):
+ def __init__(self, request, *args):
+ apply(urllib2.HTTPError.__init__, (self,)+args)
+ self.request = request
+
+ class HTTPRobotRulesProcessor(BaseHandler):
+ # before redirections, after everything else
+ handler_order = 800
+
+ try:
+ from httplib import HTTPMessage
+ except:
+ from mimetools import Message
+ http_response_class = Message
+ else:
+ http_response_class = HTTPMessage
+
+ def __init__(self, rfp_class=MechanizeRobotFileParser):
+ self.rfp_class = rfp_class
+ self.rfp = None
+ self._host = None
+
+ def http_request(self, request):
+ scheme = request.get_type()
+ if scheme not in ["http", "https"]:
+ # robots exclusion only applies to HTTP
+ return request
+
+ if request.get_selector() == "/robots.txt":
+ # /robots.txt is always OK to fetch
+ return request
+
+ host = request.get_host()
+
+ # robots.txt requests don't need to be allowed by robots.txt :-)
+ origin_req = getattr(request, "_origin_req", None)
+ if (origin_req is not None and
+ origin_req.get_selector() == "/robots.txt" and
+ origin_req.get_host() == host
+ ):
+ return request
+
+ if host != self._host:
+ self.rfp = self.rfp_class()
+ try:
+ self.rfp.set_opener(self.parent)
+ except AttributeError:
+ debug("%r instance does not support set_opener" %
+ self.rfp.__class__)
+ self.rfp.set_url(scheme+"://"+host+"/robots.txt")
+ self.rfp.read()
+ self._host = host
+
+ ua = request.get_header("User-agent", "")
+ if self.rfp.can_fetch(ua, request.get_full_url()):
+ return request
+ else:
+ # XXX This should really have raised URLError. Too late now...
+ msg = "request disallowed by robots.txt"
+ raise RobotExclusionError(
+ request,
+ request.get_full_url(),
+ 403, msg,
+ self.http_response_class(StringIO()), StringIO(msg))
+
+ https_request = http_request
+
+class HTTPRefererProcessor(BaseHandler):
+ """Add Referer header to requests.
+
+ This only makes sense if you use each RefererProcessor for a single
+ chain of requests only (so, for example, if you use a single
+ HTTPRefererProcessor to fetch a series of URLs extracted from a single
+ page, this will break).
+
+ There's a proper implementation of this in mechanize.Browser.
+
+ """
+ def __init__(self):
+ self.referer = None
+
+ def http_request(self, request):
+ if ((self.referer is not None) and
+ not request.has_header("Referer")):
+ request.add_unredirected_header("Referer", self.referer)
+ return request
+
+ def http_response(self, request, response):
+ self.referer = response.geturl()
+ return response
+
+ https_request = http_request
+ https_response = http_response
+
+
+def clean_refresh_url(url):
+ # e.g. Firefox 1.5 does (something like) this
+ if ((url.startswith('"') and url.endswith('"')) or
+ (url.startswith("'") and url.endswith("'"))):
+ url = url[1:-1]
+ return _rfc3986.clean_url(url, "latin-1") # XXX encoding
+
+def parse_refresh_header(refresh):
+ """
+ >>> parse_refresh_header("1; url=http://example.com/")
+ (1.0, 'http://example.com/')
+ >>> parse_refresh_header("1; url='http://example.com/'")
+ (1.0, 'http://example.com/')
+ >>> parse_refresh_header("1")
+ (1.0, None)
+ >>> parse_refresh_header("blah")
+ Traceback (most recent call last):
+ ValueError: invalid literal for float(): blah
+
+ """
+
+ ii = refresh.find(";")
+ if ii != -1:
+ pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:]
+ jj = newurl_spec.find("=")
+ key = None
+ if jj != -1:
+ key, newurl = newurl_spec[:jj], newurl_spec[jj+1:]
+ newurl = clean_refresh_url(newurl)
+ if key is None or key.strip().lower() != "url":
+ raise ValueError()
+ else:
+ pause, newurl = float(refresh), None
+ return pause, newurl
+
+class HTTPRefreshProcessor(BaseHandler):
+ """Perform HTTP Refresh redirections.
+
+ Note that if a non-200 HTTP code has occurred (for example, a 30x
+ redirect), this processor will do nothing.
+
+ By default, only zero-time Refresh headers are redirected. Use the
+ max_time attribute / constructor argument to allow Refresh with longer
+ pauses. Use the honor_time attribute / constructor argument to control
+ whether the requested pause is honoured (with a time.sleep()) or
+ skipped in favour of immediate redirection.
+
+ Public attributes:
+
+ max_time: see above
+ honor_time: see above
+
+ """
+ handler_order = 1000
+
+ def __init__(self, max_time=0, honor_time=True):
+ self.max_time = max_time
+ self.honor_time = honor_time
+
+ def http_response(self, request, response):
+ code, msg, hdrs = response.code, response.msg, response.info()
+
+ if code == 200 and hdrs.has_key("refresh"):
+ refresh = hdrs.getheaders("refresh")[0]
+ try:
+ pause, newurl = parse_refresh_header(refresh)
+ except ValueError:
+ debug("bad Refresh header: %r" % refresh)
+ return response
+ if newurl is None:
+ newurl = response.geturl()
+ if (self.max_time is None) or (pause <= self.max_time):
+ if pause > 1E-3 and self.honor_time:
+ time.sleep(pause)
+ hdrs["location"] = newurl
+ # hardcoded http is NOT a bug
+ response = self.parent.error(
+ "http", request, response,
+ "refresh", msg, hdrs)
+
+ return response
+
+ https_response = http_response
+
+class HTTPErrorProcessor(BaseHandler):
+ """Process HTTP error responses.
+
+ The purpose of this handler is to allow other response processors a
+ look-in by removing the call to parent.error() from
+ AbstractHTTPHandler.
+
+ For non-200 error codes, this just passes the job on to the
+ Handler.<proto>_error_<code> methods, via the OpenerDirector.error
+ method. Eventually, urllib2.HTTPDefaultErrorHandler will raise an
+ HTTPError if no other handler handles the error.
+
+ """
+ handler_order = 1000 # after all other processors
+
+ def http_response(self, request, response):
+ code, msg, hdrs = response.code, response.msg, response.info()
+
+ if code != 200:
+ # hardcoded http is NOT a bug
+ response = self.parent.error(
+ "http", request, response, code, msg, hdrs)
+
+ return response
+
+ https_response = http_response
+
+
+class HTTPDefaultErrorHandler(BaseHandler):
+ def http_error_default(self, req, fp, code, msg, hdrs):
+ # why these error methods took the code, msg, headers args in the first
+ # place rather than a response object, I don't know, but to avoid
+ # multiple wrapping, we're discarding them
+
+ if isinstance(fp, urllib2.HTTPError):
+ response = fp
+ else:
+ response = urllib2.HTTPError(
+ req.get_full_url(), code, msg, hdrs, fp)
+ assert code == response.code
+ assert msg == response.msg
+ assert hdrs == response.hdrs
+ raise response
+
+
+class AbstractHTTPHandler(BaseHandler):
+
+ def __init__(self, debuglevel=0):
+ self._debuglevel = debuglevel
+
+ def set_http_debuglevel(self, level):
+ self._debuglevel = level
+
+ def do_request_(self, request):
+ host = request.get_host()
+ if not host:
+ raise URLError('no host given')
+
+ if request.has_data(): # POST
+ data = request.get_data()
+ if not request.has_header('Content-type'):
+ request.add_unredirected_header(
+ 'Content-type',
+ 'application/x-www-form-urlencoded')
+
+ scheme, sel = urllib.splittype(request.get_selector())
+ sel_host, sel_path = urllib.splithost(sel)
+ if not request.has_header('Host'):
+ request.add_unredirected_header('Host', sel_host or host)
+ for name, value in self.parent.addheaders:
+ name = name.capitalize()
+ if not request.has_header(name):
+ request.add_unredirected_header(name, value)
+
+ return request
+
+ def do_open(self, http_class, req):
+ """Return an addinfourl object for the request, using http_class.
+
+ http_class must implement the HTTPConnection API from httplib.
+ The addinfourl return value is a file-like object. It also
+ has methods and attributes including:
+ - info(): return a mimetools.Message object for the headers
+ - geturl(): return the original request URL
+ - code: HTTP status code
+ """
+ host = req.get_host()
+ if not host:
+ raise URLError('no host given')
+
+ h = http_class(host) # will parse host:port
+ h.set_debuglevel(self._debuglevel)
+
+ headers = dict(req.headers)
+ headers.update(req.unredirected_hdrs)
+ # We want to make an HTTP/1.1 request, but the addinfourl
+ # class isn't prepared to deal with a persistent connection.
+ # It will try to read all remaining data from the socket,
+ # which will block while the server waits for the next request.
+ # So make sure the connection gets closed after the (only)
+ # request.
+ headers["Connection"] = "close"
+ headers = dict(
+ [(name.title(), val) for name, val in headers.items()])
+ try:
+ h.request(req.get_method(), req.get_selector(), req.data, headers)
+ r = h.getresponse()
+ except socket.error, err: # XXX what error?
+ raise URLError(err)
+
+ # Pick apart the HTTPResponse object to get the addinfourl
+ # object initialized properly.
+
+ # Wrap the HTTPResponse object in socket's file object adapter
+ # for Windows. That adapter calls recv(), so delegate recv()
+ # to read(). This weird wrapping allows the returned object to
+ # have readline() and readlines() methods.
+
+ # XXX It might be better to extract the read buffering code
+ # out of socket._fileobject() and into a base class.
+
+ r.recv = r.read
+ fp = socket._fileobject(r)
+
+ resp = closeable_response(fp, r.msg, req.get_full_url(),
+ r.status, r.reason)
+ return resp
+
+
+class HTTPHandler(AbstractHTTPHandler):
+ def http_open(self, req):
+ return self.do_open(httplib.HTTPConnection, req)
+
+ http_request = AbstractHTTPHandler.do_request_
+
+if hasattr(httplib, 'HTTPS'):
+
+ class HTTPSConnectionFactory:
+ def __init__(self, key_file, cert_file):
+ self._key_file = key_file
+ self._cert_file = cert_file
+ def __call__(self, hostport):
+ return httplib.HTTPSConnection(
+ hostport,
+ key_file=self._key_file, cert_file=self._cert_file)
+
+ class HTTPSHandler(AbstractHTTPHandler):
+ def __init__(self, client_cert_manager=None):
+ AbstractHTTPHandler.__init__(self)
+ self.client_cert_manager = client_cert_manager
+
+ def https_open(self, req):
+ if self.client_cert_manager is not None:
+ key_file, cert_file = self.client_cert_manager.find_key_cert(
+ req.get_full_url())
+ conn_factory = HTTPSConnectionFactory(key_file, cert_file)
+ else:
+ conn_factory = httplib.HTTPSConnection
+ return self.do_open(conn_factory, req)
+
+ https_request = AbstractHTTPHandler.do_request_
Property changes on: Zope3/branches/adamg-mechanize-update/src/mechanize/_http.py
___________________________________________________________________
Name: svn:keywords
+ Date Author Id Revision
Name: svn:eol-style
+ native
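
The redirect loop detection in the new HTTPRedirectHandler rests on a per-request dict (redirect_dict) that counts how often each URL has been reached: revisiting a URL is legal, since cookies can change the outcome, but more than max_repeats visits to one URL, or more than max_redirections redirections in total, is treated as a loop. A stripped-down sketch of just that bookkeeping, outside any handler:

    MAX_REPEATS = 4        # per-URL limit, as in HTTPRedirectHandler above
    MAX_REDIRECTIONS = 10  # overall limit

    def check_redirects(targets):
        """targets: redirect URLs in the order a server sends them."""
        visited = {}
        for url in targets:
            if (visited.get(url, 0) >= MAX_REPEATS or
                len(visited) >= MAX_REDIRECTIONS):
                raise RuntimeError("probable redirect loop at %s" % url)
            visited[url] = visited.get(url, 0) + 1

    # two made-up URLs bouncing back and forth trip the per-URL limit
    try:
        check_redirects(["http://a.example/", "http://b.example/"] * 10)
    except RuntimeError:
        pass  # expected
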
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_lwpcookiejar.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_lwpcookiejar.py 2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_lwpcookiejar.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -18,12 +18,12 @@
"""
-import time, re, string, logging
+import time, re, logging
from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \
MISSING_FILENAME_TEXT, LoadError
from _headersutil import join_header_words, split_header_words
-from _util import startswith, iso2time, time2isoz
+from _util import iso2time, time2isoz
debug = logging.getLogger("mechanize").debug
@@ -89,7 +89,7 @@
debug(" Not saving %s: expired", cookie.name)
continue
r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
- return string.join(r+[""], "\n")
+ return "\n".join(r+[""])
def save(self, filename=None, ignore_discard=False, ignore_expires=False):
if filename is None:
@@ -127,9 +127,9 @@
while 1:
line = f.readline()
if line == "": break
- if not startswith(line, header):
+ if not line.startswith(header):
continue
- line = string.strip(line[len(header):])
+ line = line[len(header):].strip()
for data in split_header_words([line]):
name, value = data[0]
@@ -139,7 +139,7 @@
standard[k] = False
for k, v in data[1:]:
if k is not None:
- lc = string.lower(k)
+ lc = k.lower()
else:
lc = None
# don't lose case distinction for unknown fields
@@ -161,7 +161,7 @@
if expires is None:
discard = True
domain = h("domain")
- domain_specified = startswith(domain, ".")
+ domain_specified = domain.startswith(".")
c = Cookie(h("version"), name, value,
h("port"), h("port_spec"),
domain, domain_specified, h("domain_dot"),
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_mechanize.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_mechanize.py 2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_mechanize.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -9,14 +9,15 @@
"""
-import urllib2, urlparse, sys, copy, re
+import urllib2, sys, copy, re
-from _useragent import UserAgent
+from _useragent import UserAgentBase
from _html import DefaultFactory
-from _util import response_seek_wrapper, closeable_response
+import _response
import _request
+import _rfc3986
-__version__ = (0, 1, 2, "b", None) # 0.1.2b
+__version__ = (0, 1, 7, "b", None) # 0.1.7b
class BrowserStateError(Exception): pass
class LinkNotFoundError(Exception): pass
@@ -45,60 +46,28 @@
def clear(self):
del self._history[:]
def close(self):
- """
- If nothing has been added, .close should work.
-
- >>> history = History()
- >>> history.close()
-
- Under some circumstances response can be None, in that case
- this method should not raise an exception.
-
- >>> history.add(None, None)
- >>> history.close()
- """
for request, response in self._history:
if response is not None:
response.close()
del self._history[:]
-# Horrible, but needed, at least until fork urllib2. Even then, may want
-# to preseve urllib2 compatibility.
-def upgrade_response(response):
- # a urllib2 handler constructed the response, i.e. the response is an
- # urllib.addinfourl, instead of a _Util.closeable_response as returned
- # by e.g. mechanize.HTTPHandler
- try:
- code = response.code
- except AttributeError:
- code = None
- try:
- msg = response.msg
- except AttributeError:
- msg = None
- # may have already-.read() data from .seek() cache
- data = None
- get_data = getattr(response, "get_data", None)
- if get_data:
- data = get_data()
+class HTTPRefererProcessor(urllib2.BaseHandler):
+ def http_request(self, request):
+ # See RFC 2616 14.36. The only times we know the source of the
+ # request URI has a URI associated with it are redirect, and
+ # Browser.click() / Browser.submit() / Browser.follow_link().
+ # Otherwise, it's the user's job to add any Referer header before
+ # .open()ing.
+ if hasattr(request, "redirect_dict"):
+ request = self.parent._add_referer_header(
+ request, origin_request=False)
+ return request
- response = closeable_response(
- response.fp, response.info(), response.geturl(), code, msg)
- response = response_seek_wrapper(response)
- if data:
- response.set_data(data)
- return response
-class ResponseUpgradeProcessor(urllib2.BaseHandler):
- # upgrade responses to be .close()able without becoming unusable
- handler_order = 0 # before anything else
- def any_response(self, request, response):
- if not hasattr(response, 'closeable_response'):
- response = upgrade_response(response)
- return response
+ https_request = http_request
-class Browser(UserAgent):
+class Browser(UserAgentBase):
"""Browser-like class with support for history, forms and links.
BrowserStateError is raised whenever the browser is in the wrong state to
@@ -113,10 +82,10 @@
"""
- handler_classes = UserAgent.handler_classes.copy()
- handler_classes["_response_upgrade"] = ResponseUpgradeProcessor
- default_others = copy.copy(UserAgent.default_others)
- default_others.append("_response_upgrade")
+ handler_classes = copy.copy(UserAgentBase.handler_classes)
+ handler_classes["_referer"] = HTTPRefererProcessor
+ default_features = copy.copy(UserAgentBase.default_features)
+ default_features.append("_referer")
def __init__(self,
factory=None,
@@ -128,8 +97,8 @@
Only named arguments should be passed to this constructor.
factory: object implementing the mechanize.Factory interface.
- history: object implementing the mechanize.History interface. Note this
- interface is still experimental and may change in future.
+ history: object implementing the mechanize.History interface. Note
+ this interface is still experimental and may change in future.
request_class: Request class to use. Defaults to mechanize.Request
by default for Pythons older than 2.4, urllib2.Request otherwise.
@@ -142,11 +111,11 @@
constructor, to ensure only one Request class is used.
"""
+ self._handle_referer = True
+
if history is None:
history = History()
self._history = history
- self.request = self._response = None
- self.form = None
if request_class is None:
if not hasattr(urllib2.Request, "add_unredirected_header"):
@@ -160,48 +129,108 @@
self._factory = factory
self.request_class = request_class
- UserAgent.__init__(self) # do this last to avoid __getattr__ problems
+ self.request = None
+ self._set_response(None, False)
+ # do this last to avoid __getattr__ problems
+ UserAgentBase.__init__(self)
+
def close(self):
+ UserAgentBase.close(self)
if self._response is not None:
self._response.close()
- UserAgent.close(self)
if self._history is not None:
self._history.close()
self._history = None
+
+ # make use after .close easy to spot
+ self.form = None
self.request = self._response = None
+ self.request = self.response = self.set_response = None
+ self.geturl = self.reload = self.back = None
+ self.clear_history = self.set_cookie = self.links = self.forms = None
+ self.viewing_html = self.encoding = self.title = None
+ self.select_form = self.click = self.submit = self.click_link = None
+ self.follow_link = self.find_link = None
+ def set_handle_referer(self, handle):
+ """Set whether to add Referer header to each request.
+
+ This base class does not implement this feature (so don't turn this on
+ if you're using this base class directly), but the subclass
+ mechanize.Browser does.
+
+ """
+ self._set_handler("_referer", handle)
+ self._handle_referer = bool(handle)
+
+ def _add_referer_header(self, request, origin_request=True):
+ if self.request is None:
+ return request
+ scheme = request.get_type()
+ original_scheme = self.request.get_type()
+ if scheme not in ["http", "https"]:
+ return request
+ if not origin_request and not self.request.has_header("Referer"):
+ return request
+
+ if (self._handle_referer and
+ original_scheme in ["http", "https"] and
+ not (original_scheme == "https" and scheme != "https")):
+ # strip URL fragment (RFC 2616 14.36)
+ parts = _rfc3986.urlsplit(self.request.get_full_url())
+ parts = parts[:-1]+(None,)
+ referer = _rfc3986.urlunsplit(parts)
+ request.add_unredirected_header("Referer", referer)
+ return request
+
+ def open_novisit(self, url, data=None):
+ """Open a URL without visiting it.
+
+ The browser state (including .request, .response(), history, forms and
+ links) is left unchanged by calling this function.
+
+ The interface is the same as for .open().
+
+ This is useful for things like fetching images.
+
+ See also .retrieve().
+
+ """
+ return self._mech_open(url, data, visit=False)
+
def open(self, url, data=None):
- if self._response is not None:
- self._response.close()
return self._mech_open(url, data)
- def _mech_open(self, url, data=None, update_history=True):
+ def _mech_open(self, url, data=None, update_history=True, visit=None):
try:
url.get_full_url
except AttributeError:
# string URL -- convert to absolute URL if required
- scheme, netloc = urlparse.urlparse(url)[:2]
- if not scheme:
+ scheme, authority = _rfc3986.urlsplit(url)[:2]
+ if scheme is None:
# relative URL
- assert not netloc, "malformed URL"
if self._response is None:
raise BrowserStateError(
- "can't fetch relative URL: not viewing any document")
- url = urlparse.urljoin(self._response.geturl(), url)
+ "can't fetch relative reference: "
+ "not viewing any document")
+ url = _rfc3986.urljoin(self._response.geturl(), url)
- if self.request is not None and update_history:
- self._history.add(self.request, self._response)
- self._response = None
- # we want self.request to be assigned even if UserAgent.open fails
- self.request = self._request(url, data)
- self._previous_scheme = self.request.get_type()
+ request = self._request(url, data, visit)
+ visit = request.visit
+ if visit is None:
+ visit = True
+ if visit:
+ self._visit_request(request, update_history)
+
success = True
try:
- response = UserAgent.open(self, self.request, data)
+ response = UserAgentBase.open(self, request, data)
except urllib2.HTTPError, error:
success = False
+ if error.fp is None: # not a response
+ raise
response = error
## except (IOError, socket.error, OSError), error:
## # Yes, urllib2 really does raise all these :-((
@@ -214,10 +243,16 @@
## # Python core, a fix would need some backwards-compat. hack to be
## # acceptable.
## raise
- self.set_response(response)
+
+ if visit:
+ self._set_response(response, False)
+ response = copy.copy(self._response)
+ elif response is not None:
+ response = _response.upgrade_response(response)
+
if not success:
- raise error
- return copy.copy(self._response)
+ raise response
+ return response
def __str__(self):
text = []
@@ -241,24 +276,52 @@
return copy.copy(self._response)
def set_response(self, response):
- """Replace current response with (a copy of) response."""
+ """Replace current response with (a copy of) response.
+
+ response may be None.
+
+ This is intended mostly for HTML-preprocessing.
+ """
+ self._set_response(response, True)
+
+ def _set_response(self, response, close_current):
# sanity check, necessary but far from sufficient
- if not (hasattr(response, "info") and hasattr(response, "geturl") and
- hasattr(response, "read")):
+ if not (response is None or
+ (hasattr(response, "info") and hasattr(response, "geturl") and
+ hasattr(response, "read")
+ )
+ ):
raise ValueError("not a response object")
self.form = None
+ if response is not None:
+ response = _response.upgrade_response(response)
+ if close_current and self._response is not None:
+ self._response.close()
+ self._response = response
+ self._factory.set_response(response)
- if not hasattr(response, "seek"):
- response = response_seek_wrapper(response)
- if not hasattr(response, "closeable_response"):
- response = upgrade_response(response)
- else:
- response = copy.copy(response)
+ def visit_response(self, response, request=None):
+ """Visit the response, as if it had been .open()ed.
- self._response = response
- self._factory.set_response(self._response)
+ Unlike .set_response(), this updates history rather than replacing the
+ current response.
+ """
+ if request is None:
+ request = _request.Request(response.geturl())
+ self._visit_request(request, True)
+ self._set_response(response, False)
+ def _visit_request(self, request, update_history):
+ if self._response is not None:
+ self._response.close()
+ if self.request is not None and update_history:
+ self._history.add(self.request, self._response)
+ self._response = None
+ # we want self.request to be assigned even if UserAgentBase.open
+ # fails
+ self.request = request
+
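
To make the HTML-preprocessing use of .set_response() concrete, a hedged sketch; .get_data() and .set_data() come from the response_seek_wrapper class added in _response.py below, and the URL and markup are invented:

    br = mechanize.Browser()
    response = br.open("http://example.com/")
    html = response.get_data()
    response.set_data(html.replace("<blink>", "").replace("</blink>", ""))
    br.set_response(response)    # forms and links are re-parsed from the edited HTML
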
def geturl(self):
"""Get URL of current document."""
if self._response is None:
@@ -283,11 +346,53 @@
self._response.close()
self.request, response = self._history.back(n, self._response)
self.set_response(response)
- return response
+ if not response.read_complete:
+ return self.reload()
+ return copy.copy(response)
def clear_history(self):
self._history.clear()
+ def set_cookie(self, cookie_string):
+ """Request to set a cookie.
+
+ Note that it is NOT necessary to call this method under ordinary
+ circumstances: cookie handling is normally entirely automatic. The
+ intended use case is rather to simulate the setting of a cookie by
+ client script in a web page (e.g. JavaScript). In that case, use of
+ this method is necessary because mechanize currently does not support
+ JavaScript, VBScript, etc.
+
+ The cookie is added in the same way as if it had arrived with the
+ current response, as a result of the current request. This means that,
+ for example, if it is not appropriate to set the cookie based on the
+ current request, no cookie will be set.
+
+ The cookie will be returned automatically with subsequent responses
+ made by the Browser instance whenever that's appropriate.
+
+ cookie_string should be a valid value of the Set-Cookie header.
+
+ For example:
+
+ browser.set_cookie(
+ "sid=abcdef; expires=Wednesday, 09-Nov-06 23:12:40 GMT")
+
+ Currently, this method does not allow for adding RFC 2965 cookies.
+ This limitation will be lifted if anybody requests it.
+
+ """
+ if self._response is None:
+ raise BrowserStateError("not viewing any document")
+ if self.request.get_type() not in ["http", "https"]:
+ raise BrowserStateError("can't set cookie for non-HTTP/HTTPS "
+ "transactions")
+ cookiejar = self._ua_handlers["_cookies"].cookiejar
+ response = self.response() # copy
+ headers = response.info()
+ headers["Set-cookie"] = cookie_string
+ cookiejar.extract_cookies(response, self.request)
+
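
A short sketch of the intended .set_cookie() flow, simulating a cookie that page script would otherwise have set (site and cookie values are made up):

    br = mechanize.Browser()
    br.open("http://example.com/login")
    br.set_cookie("sid=abcdef; expires=Wednesday, 09-Nov-06 23:12:40 GMT")
    br.open("http://example.com/account")   # "Cookie: sid=abcdef" is now sent automatically
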
def links(self, **kwds):
"""Return iterable over links (mechanize.Link objects)."""
if not self.viewing_html():
@@ -308,6 +413,24 @@
raise BrowserStateError("not viewing HTML")
return self._factory.forms()
+ def global_form(self):
+ """Return the global form object, or None if the factory implementation
+ did not supply one.
+
+ The "global" form object contains all controls that are not descendants of
+ any FORM element.
+
+ The returned form object implements the ClientForm.HTMLForm interface.
+
+ This is a separate method since the global form is not regarded as part
+ of the sequence of forms in the document -- mostly for
+ backwards-compatibility.
+
+ """
+ if not self.viewing_html():
+ raise BrowserStateError("not viewing HTML")
+ return self._factory.global_form
+
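
A sketch of the new .global_form() accessor, for controls that sit outside any FORM element (the page and its controls are hypothetical):

    br.open("http://example.com/page-with-stray-inputs")
    form = br.global_form()
    if form is not None:
        print [control.name for control in form.controls]
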
def viewing_html(self):
"""Return whether the current response contains HTML data."""
if self._response is None:
@@ -340,6 +463,10 @@
interface, so you can call methods like .set_value(), .set(), and
.click().
+ Another way to select a form is to assign to the .form attribute. The
+ form assigned should be one of the objects returned by the .forms()
+ method.
+
At least one of the name, predicate and nr arguments must be supplied.
If no matching form is found, mechanize.FormNotFoundError is raised.
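
Both ways of selecting a form mentioned above, in a hedged sketch (the form and control names are invented):

    br.select_form(name="order")            # select by name
    # or assign one of the .forms() objects directly:
    br.form = list(br.forms())[0]
    br.form["quantity"] = "5"
    response = br.submit()
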
@@ -382,26 +509,6 @@
description = ", ".join(description)
raise FormNotFoundError("no form matching "+description)
- def _add_referer_header(self, request, origin_request=True):
- if self.request is None:
- return request
- scheme = request.get_type()
- original_scheme = self.request.get_type()
- if scheme not in ["http", "https"]:
- return request
- if not origin_request and not self.request.has_header("Referer"):
- return request
-
- if (self._handle_referer and
- original_scheme in ["http", "https"] and
- not (original_scheme == "https" and scheme != "https")):
- # strip URL fragment (RFC 2616 14.36)
- parts = urlparse.urlparse(self.request.get_full_url())
- parts = parts[:-1]+("",)
- referer = urlparse.urlunparse(parts)
- request.add_unredirected_header("Referer", referer)
- return request
-
def click(self, *args, **kwds):
"""See ClientForm.HTMLForm.click for documentation."""
if not self.viewing_html():
@@ -507,9 +614,6 @@
".select_form()?)" % (self.__class__, name))
return getattr(form, name)
-#---------------------------------------------------
-# Private methods.
-
def _filter_links(self, links,
text=None, text_regex=None,
name=None, name_regex=None,
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_mozillacookiejar.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_mozillacookiejar.py 2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_mozillacookiejar.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -9,11 +9,10 @@
"""
-import re, string, time, logging
+import re, time, logging
from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \
MISSING_FILENAME_TEXT, LoadError
-from _util import startswith, endswith
debug = logging.getLogger("ClientCookie").debug
@@ -72,23 +71,23 @@
if line == "": break
# last field may be absent, so keep any trailing tab
- if endswith(line, "\n"): line = line[:-1]
+ if line.endswith("\n"): line = line[:-1]
# skip comments and blank lines XXX what is $ for?
- if (startswith(string.strip(line), "#") or
- startswith(string.strip(line), "$") or
- string.strip(line) == ""):
+ if (line.strip().startswith("#") or
+ line.strip().startswith("$") or
+ line.strip() == ""):
continue
domain, domain_specified, path, secure, expires, name, value = \
- string.split(line, "\t")
+ line.split("\t")
secure = (secure == "TRUE")
domain_specified = (domain_specified == "TRUE")
if name == "":
name = value
value = None
- initial_dot = startswith(domain, ".")
+ initial_dot = domain.startswith(".")
assert domain_specified == initial_dot
discard = False
@@ -137,7 +136,7 @@
continue
if cookie.secure: secure = "TRUE"
else: secure = "FALSE"
- if startswith(cookie.domain, "."): initial_dot = "TRUE"
+ if cookie.domain.startswith("."): initial_dot = "TRUE"
else: initial_dot = "FALSE"
if cookie.expires is not None:
expires = str(cookie.expires)
@@ -153,8 +152,8 @@
name = cookie.name
value = cookie.value
f.write(
- string.join([cookie.domain, initial_dot, cookie.path,
- secure, expires, name, value], "\t")+
+ "\t".join([cookie.domain, initial_dot, cookie.path,
+ secure, expires, name, value])+
"\n")
finally:
f.close()
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_msiecookiejar.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_msiecookiejar.py 2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_msiecookiejar.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -11,13 +11,12 @@
# XXX names and comments are not great here
-import os, re, string, time, struct, logging
+import os, re, time, struct, logging
if os.name == "nt":
import _winreg
from _clientcookie import FileCookieJar, CookieJar, Cookie, \
MISSING_FILENAME_TEXT, LoadError
-from _util import startswith
debug = logging.getLogger("mechanize").debug
@@ -50,7 +49,7 @@
return divmod((filetime - WIN32_EPOCH), 10000000L)[0]
def binary_to_char(c): return "%02X" % ord(c)
-def binary_to_str(d): return string.join(map(binary_to_char, list(d)), "")
+def binary_to_str(d): return "".join(map(binary_to_char, list(d)))
class MSIEBase:
magic_re = re.compile(r"Client UrlCache MMF Ver \d\.\d.*")
@@ -153,7 +152,7 @@
else:
discard = False
domain = cookie["DOMAIN"]
- initial_dot = startswith(domain, ".")
+ initial_dot = domain.startswith(".")
if initial_dot:
domain_specified = True
else:
@@ -201,7 +200,7 @@
now = int(time.time())
if username is None:
- username = string.lower(os.environ['USERNAME'])
+ username = os.environ['USERNAME'].lower()
cookie_dir = os.path.dirname(filename)
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_opener.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_opener.py 2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_opener.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -9,92 +9,31 @@
"""
-import urllib2, string, bisect, urlparse
-
-from _util import startswith, isstringlike
-from _request import Request
-
+import os, urllib2, bisect, urllib, httplib, types, tempfile
try:
+ import threading as _threading
+except ImportError:
+ import dummy_threading as _threading
+try:
set
except NameError:
import sets
set = sets.Set
-def methnames(obj):
- """Return method names of class instance.
+import _http
+import _upgrade
+import _rfc3986
+import _response
+from _util import isstringlike
+from _request import Request
- dir(obj) doesn't work across Python versions, this does.
- """
- return methnames_of_instance_as_dict(obj).keys()
+class ContentTooShortError(urllib2.URLError):
+ def __init__(self, reason, result):
+ urllib2.URLError.__init__(self, reason)
+ self.result = result
-def methnames_of_instance_as_dict(inst):
- """
- It is possible for an attribute to be present in the results of dir(inst),
- but for getattr(inst, attr_name) to raise an Attribute error, that should
- be handled gracefully.
- >>> class BadAttr(object):
- ... def error(self):
- ... raise AttributeError
- ... error = property(error)
-
- >>> inst = BadAttr()
- >>> 'error' in dir(inst)
- True
- >>> inst.error
- Traceback (most recent call last):
- ...
- AttributeError
-
- >>> result = methnames_of_instance_as_dict(inst) # no exception
- """
- names = {}
- names.update(methnames_of_class_as_dict(inst.__class__))
- for methname in dir(inst):
- try:
- candidate = getattr(inst, methname)
- except AttributeError:
- continue
- if callable(candidate):
- names[methname] = None
- return names
-
-def methnames_of_class_as_dict(klass):
- """
- It is possible for an attribute to be present in the results of dir(inst),
- but for getattr(inst, attr_name) to raise an Attribute error, that should
- be handled gracefully.
-
- >>> class BadClass(object):
- ... def error(self):
- ... raise AttributeError
- ... error = property(error)
- ... __bases__ = []
-
- >>> klass = BadClass()
- >>> 'error' in dir(klass)
- True
- >>> klass.error
- Traceback (most recent call last):
- ...
- AttributeError
-
- >>> result = methnames_of_class_as_dict(klass) # no exception
- """
- names = {}
- for methname in dir(klass):
- try:
- candidate = getattr(klass, methname)
- except AttributeError:
- continue
- if callable(candidate):
- names[methname] = None
- for baseclass in klass.__bases__:
- names.update(methnames_of_class_as_dict(baseclass))
- return names
-
-
class OpenerDirector(urllib2.OpenerDirector):
def __init__(self):
urllib2.OpenerDirector.__init__(self)
@@ -105,6 +44,7 @@
self._any_request = {}
self._any_response = {}
self._handler_index_valid = True
+ self._tempfiles = []
def add_handler(self, handler):
if handler in self.handlers:
@@ -128,7 +68,7 @@
for handler in self.handlers:
added = False
- for meth in methnames(handler):
+ for meth in dir(handler):
if meth in ["redirect_request", "do_open", "proxy_open"]:
# oops, coincidental match
continue
@@ -146,8 +86,8 @@
scheme = meth[:ii]
condition = meth[ii+1:]
- if startswith(condition, "error"):
- jj = string.find(meth[ii+1:], "_") + ii + 1
+ if condition.startswith("error"):
+ jj = meth[ii+1:].find("_") + ii + 1
kind = meth[jj+1:]
try:
kind = int(kind)
@@ -198,18 +138,25 @@
self._any_request = any_request
self._any_response = any_response
- def _request(self, url_or_req, data):
+ def _request(self, url_or_req, data, visit):
if isstringlike(url_or_req):
- req = Request(url_or_req, data)
+ req = Request(url_or_req, data, visit=visit)
else:
# already a urllib2.Request or mechanize.Request instance
req = url_or_req
if data is not None:
req.add_data(data)
+ # XXX yuck, give request a .visit attribute if it doesn't have one
+ try:
+ req.visit
+ except AttributeError:
+ req.visit = None
+ if visit is not None:
+ req.visit = visit
return req
def open(self, fullurl, data=None):
- req = self._request(fullurl, data)
+ req = self._request(fullurl, data, None)
req_scheme = req.get_type()
self._maybe_reindex_handlers()
@@ -267,48 +214,208 @@
args = (dict, 'default', 'http_error_default') + orig_args
return apply(self._call_chain, args)
+ BLOCK_SIZE = 1024*8
def retrieve(self, fullurl, filename=None, reporthook=None, data=None):
"""Returns (filename, headers).
For remote objects, the default filename will refer to a temporary
- file.
+ file. Temporary files are removed when the OpenerDirector.close()
+ method is called.
+ For file: URLs, at present the returned filename is None. This may
+ change in future.
+
+ If the actual number of bytes read is less than indicated by the
+ Content-Length header, raises ContentTooShortError (a URLError
+ subclass). The exception's .result attribute contains the (filename,
+ headers) that would have been returned.
+
"""
- req = self._request(fullurl, data)
- type_ = req.get_type()
+ req = self._request(fullurl, data, False)
+ scheme = req.get_type()
fp = self.open(req)
headers = fp.info()
- if filename is None and type == 'file':
- return url2pathname(req.get_selector()), headers
+ if filename is None and scheme == 'file':
+ # XXX req.get_selector() seems broken here, return None,
+ # pending sanity :-/
+ return None, headers
+ #return urllib.url2pathname(req.get_selector()), headers
if filename:
tfp = open(filename, 'wb')
else:
- path = urlparse(fullurl)[2]
+ path = _rfc3986.urlsplit(fullurl)[2]
suffix = os.path.splitext(path)[1]
- tfp = tempfile.TemporaryFile("wb", suffix=suffix)
+ fd, filename = tempfile.mkstemp(suffix)
+ self._tempfiles.append(filename)
+ tfp = os.fdopen(fd, 'wb')
+
result = filename, headers
- bs = 1024*8
+ bs = self.BLOCK_SIZE
size = -1
read = 0
- blocknum = 1
+ blocknum = 0
if reporthook:
- if headers.has_key("content-length"):
+ if "content-length" in headers:
size = int(headers["Content-Length"])
- reporthook(0, bs, size)
+ reporthook(blocknum, bs, size)
while 1:
block = fp.read(bs)
+ if block == "":
+ break
read += len(block)
+ tfp.write(block)
+ blocknum += 1
if reporthook:
reporthook(blocknum, bs, size)
- blocknum = blocknum + 1
- if not block:
- break
- tfp.write(block)
fp.close()
tfp.close()
del fp
del tfp
- if size>=0 and read<size:
- raise IOError("incomplete retrieval error",
- "got only %d bytes out of %d" % (read,size))
+
+ # raise exception if actual size does not match content-length header
+ if size >= 0 and read < size:
+ raise ContentTooShortError(
+ "retrieval incomplete: "
+ "got only %i out of %i bytes" % (read, size),
+ result
+ )
+
return result
+
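
A hedged sketch of the reworked .retrieve() contract described in the docstring above (illustrative URL; ContentTooShortError carries the partial result in .result):

    import mechanize

    def report(block_num, block_size, total_size):
        print "block %d, %d bytes per block, %d bytes total" % (
            block_num, block_size, total_size)

    opener = mechanize.build_opener()
    try:
        filename, headers = opener.retrieve(
            "http://example.com/big.tar.gz", reporthook=report)
    except mechanize.ContentTooShortError, exc:
        filename, headers = exc.result      # what would otherwise have been returned
    opener.close()                          # also removes the temporary file
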
+ def close(self):
+ urllib2.OpenerDirector.close(self)
+
+ # make it very obvious this object is no longer supposed to be used
+ self.open = self.error = self.retrieve = self.add_handler = None
+
+ if self._tempfiles:
+ for filename in self._tempfiles:
+ try:
+ os.unlink(filename)
+ except OSError:
+ pass
+ del self._tempfiles[:]
+
+
+def wrapped_open(urlopen, process_response_object, fullurl, data=None):
+ success = True
+ try:
+ response = urlopen(fullurl, data)
+ except urllib2.HTTPError, error:
+ success = False
+ if error.fp is None: # not a response
+ raise
+ response = error
+
+ if response is not None:
+ response = process_response_object(response)
+
+ if not success:
+ raise response
+ return response
+
+class ResponseProcessingOpener(OpenerDirector):
+
+ def open(self, fullurl, data=None):
+ def bound_open(fullurl, data=None):
+ return OpenerDirector.open(self, fullurl, data)
+ return wrapped_open(
+ bound_open, self.process_response_object, fullurl, data)
+
+ def process_response_object(self, response):
+ return response
+
+
+class SeekableResponseOpener(ResponseProcessingOpener):
+ def process_response_object(self, response):
+ return _response.seek_wrapped_response(response)
+
+
+class OpenerFactory:
+ """This class's interface is quite likely to change."""
+
+ default_classes = [
+ # handlers
+ urllib2.ProxyHandler,
+ urllib2.UnknownHandler,
+ _http.HTTPHandler, # derived from new AbstractHTTPHandler
+ _http.HTTPDefaultErrorHandler,
+ _http.HTTPRedirectHandler, # bugfixed
+ urllib2.FTPHandler,
+ urllib2.FileHandler,
+ # processors
+ _upgrade.HTTPRequestUpgradeProcessor,
+ _http.HTTPCookieProcessor,
+ _http.HTTPErrorProcessor,
+ ]
+ if hasattr(httplib, 'HTTPS'):
+ default_classes.append(_http.HTTPSHandler)
+ handlers = []
+ replacement_handlers = []
+
+ def __init__(self, klass=OpenerDirector):
+ self.klass = klass
+
+ def build_opener(self, *handlers):
+ """Create an opener object from a list of handlers and processors.
+
+ The opener will use several default handlers and processors, including
+ support for HTTP and FTP.
+
+ If any of the handlers passed as arguments are subclasses of the
+ default handlers, the default handlers will not be used.
+
+ """
+ opener = self.klass()
+ default_classes = list(self.default_classes)
+ skip = []
+ for klass in default_classes:
+ for check in handlers:
+ if type(check) == types.ClassType:
+ if issubclass(check, klass):
+ skip.append(klass)
+ elif type(check) == types.InstanceType:
+ if isinstance(check, klass):
+ skip.append(klass)
+ for klass in skip:
+ default_classes.remove(klass)
+
+ for klass in default_classes:
+ opener.add_handler(klass())
+ for h in handlers:
+ if type(h) == types.ClassType:
+ h = h()
+ opener.add_handler(h)
+
+ return opener
+
+
+build_opener = OpenerFactory().build_opener
+
+_opener = None
+urlopen_lock = _threading.Lock()
+def urlopen(url, data=None):
+ global _opener
+ if _opener is None:
+ urlopen_lock.acquire()
+ try:
+ if _opener is None:
+ _opener = build_opener()
+ finally:
+ urlopen_lock.release()
+ return _opener.open(url, data)
+
+def urlretrieve(url, filename=None, reporthook=None, data=None):
+ global _opener
+ if _opener is None:
+ urlopen_lock.acquire()
+ try:
+ if _opener is None:
+ _opener = build_opener()
+ finally:
+ urlopen_lock.release()
+ return _opener.retrieve(url, filename, reporthook, data)
+
+def install_opener(opener):
+ global _opener
+ _opener = opener
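
And a sketch of the module-level conveniences defined above; the processors passed to build_opener() are existing mechanize handlers, and the URL is illustrative:

    import mechanize

    opener = mechanize.build_opener(mechanize.HTTPEquivProcessor(),
                                    mechanize.HTTPRefreshProcessor())
    mechanize.install_opener(opener)        # used by urlopen() and urlretrieve()
    response = mechanize.urlopen("http://example.com/")
    print response.geturl()
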
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_request.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_request.py 2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_request.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -8,16 +8,33 @@
"""
-import urllib2, string
+import urllib2, urllib, logging
from _clientcookie import request_host
+import _rfc3986
+warn = logging.getLogger("mechanize").warning
+# don't complain about missing logging handler
+logging.getLogger("mechanize").setLevel(logging.ERROR)
+
class Request(urllib2.Request):
def __init__(self, url, data=None, headers={},
- origin_req_host=None, unverifiable=False):
+ origin_req_host=None, unverifiable=False, visit=None):
+ # In mechanize 0.2, the interpretation of a unicode url argument will
+ # change: A unicode url argument will be interpreted as an IRI, and a
+ # bytestring as a URI. For now, we accept unicode or bytestring. We
+ # don't insist that the value is always a URI (specifically, must only
+ # contain characters which are legal), because that might break working
+ # code (who knows what bytes some servers want to see, especially with
+ # browser plugins for internationalised URIs).
+ if not _rfc3986.is_clean_uri(url):
+ warn("url argument is not a URI "
+ "(contains illegal characters) %r" % url)
urllib2.Request.__init__(self, url, data, headers)
+ self.selector = None
self.unredirected_hdrs = {}
+ self.visit = visit
# All the terminology below comes from RFC 2965.
self.unverifiable = unverifiable
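
A sketch of the new visit argument from the caller's side (URL illustrative); this is the request-level equivalent of Browser.open_novisit():

    import mechanize

    br = mechanize.Browser()
    br.open("http://example.com/")
    req = mechanize.Request("http://example.com/feed.xml", visit=False)
    br.open(req)    # fetched through the Browser, but history/forms/links untouched
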
@@ -31,6 +48,9 @@
origin_req_host = request_host(self)
self.origin_req_host = origin_req_host
+ def get_selector(self):
+ return urllib.splittag(self.__r_host)[0]
+
def get_origin_req_host(self):
return self.origin_req_host
@@ -39,14 +59,12 @@
def add_unredirected_header(self, key, val):
"""Add a header that will not be added to a redirected request."""
- self.unredirected_hdrs[string.capitalize(key)] = val
+ self.unredirected_hdrs[key.capitalize()] = val
def has_header(self, header_name):
"""True iff request has named header (regular or unredirected)."""
- if (self.headers.has_key(header_name) or
- self.unredirected_hdrs.has_key(header_name)):
- return True
- return False
+ return (header_name in self.headers or
+ header_name in self.unredirected_hdrs)
def get_header(self, header_name, default=None):
return self.headers.get(
Added: Zope3/branches/adamg-mechanize-update/src/mechanize/_response.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_response.py (rev 0)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_response.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -0,0 +1,514 @@
+"""Response classes.
+
+The seek_wrapper code is not used if you're using UserAgent with
+.set_seekable_responses(False), or if you're using the urllib2-level interface
+without SeekableProcessor or HTTPEquivProcessor. Class closeable_response is
+instantiated by some handlers (AbstractHTTPHandler), but the closeable_response
+interface is only depended upon by Browser-level code. Function
+upgrade_response is only used if you're using Browser or
+ResponseUpgradeProcessor.
+
+
+Copyright 2006 John J. Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
+included with the distribution).
+
+"""
+
+import copy, mimetools
+from cStringIO import StringIO
+import urllib2
+
+# XXX Andrew Dalke kindly sent me a similar class in response to my request on
+# comp.lang.python, which I then proceeded to lose. I wrote this class
+# instead, but I think he's released his code publicly since, could pinch the
+# tests from it, at least...
+
+# For testing seek_wrapper invariant (note that
+# test_urllib2.HandlerTest.test_seekable is expected to fail when this
+# invariant checking is turned on). The invariant checking is done by module
+# ipdc, which is available here:
+# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/436834
+## from ipdbc import ContractBase
+## class seek_wrapper(ContractBase):
+class seek_wrapper:
+ """Adds a seek method to a file object.
+
+ This is only designed for seeking on readonly file-like objects.
+
+ Wrapped file-like object must have a read method. The readline method is
+ only supported if that method is present on the wrapped object. The
+ readlines method is always supported. xreadlines and iteration are
+ supported only for Python 2.2 and above.
+
+ Public attributes:
+
+ wrapped: the wrapped file object
+ is_closed: true iff .close() has been called
+
+ WARNING: All other attributes of the wrapped object (ie. those that are not
+ one of wrapped, read, readline, readlines, xreadlines, __iter__ and next)
+ are passed through unaltered, which may or may not make sense for your
+ particular file object.
+
+ """
+ # General strategy is to check that cache is full enough, then delegate to
+ # the cache (self.__cache, which is a cStringIO.StringIO instance). A seek
+ # position (self.__pos) is maintained independently of the cache, in order
+ # that a single cache may be shared between multiple seek_wrapper objects.
+ # Copying using module copy shares the cache in this way.
+
+ def __init__(self, wrapped):
+ self.wrapped = wrapped
+ self.__read_complete_state = [False]
+ self.__is_closed_state = [False]
+ self.__have_readline = hasattr(self.wrapped, "readline")
+ self.__cache = StringIO()
+ self.__pos = 0 # seek position
+
+ def invariant(self):
+ # The end of the cache is always at the same place as the end of the
+ # wrapped file.
+ return self.wrapped.tell() == len(self.__cache.getvalue())
+
+ def close(self):
+ self.wrapped.close()
+ self.is_closed = True
+
+ def __getattr__(self, name):
+ if name == "is_closed":
+ return self.__is_closed_state[0]
+ elif name == "read_complete":
+ return self.__read_complete_state[0]
+
+ wrapped = self.__dict__.get("wrapped")
+ if wrapped:
+ return getattr(wrapped, name)
+
+ return getattr(self.__class__, name)
+
+ def __setattr__(self, name, value):
+ if name == "is_closed":
+ self.__is_closed_state[0] = bool(value)
+ elif name == "read_complete":
+ if not self.is_closed:
+ self.__read_complete_state[0] = bool(value)
+ else:
+ self.__dict__[name] = value
+
+ def seek(self, offset, whence=0):
+ assert whence in [0,1,2]
+
+ # how much data, if any, do we need to read?
+ if whence == 2: # 2: relative to end of *wrapped* file
+ if offset < 0: raise ValueError("negative seek offset")
+ # since we don't know yet where the end of that file is, we must
+ # read everything
+ to_read = None
+ else:
+ if whence == 0: # 0: absolute
+ if offset < 0: raise ValueError("negative seek offset")
+ dest = offset
+ else: # 1: relative to current position
+ pos = self.__pos
+ if pos < offset:
+ raise ValueError("seek to before start of file")
+ dest = pos + offset
+ end = len(self.__cache.getvalue())
+ to_read = dest - end
+ if to_read < 0:
+ to_read = 0
+
+ if to_read != 0:
+ self.__cache.seek(0, 2)
+ if to_read is None:
+ assert whence == 2
+ self.__cache.write(self.wrapped.read())
+ self.read_complete = True
+ self.__pos = self.__cache.tell() - offset
+ else:
+ data = self.wrapped.read(to_read)
+ if not data:
+ self.read_complete = True
+ else:
+ self.__cache.write(data)
+ # Don't raise an exception even if we've seek()ed past the end
+ # of .wrapped, since fseek() doesn't complain in that case.
+ # Also like fseek(), pretend we have seek()ed past the end,
+ # i.e. not:
+ #self.__pos = self.__cache.tell()
+ # but rather:
+ self.__pos = dest
+ else:
+ self.__pos = dest
+
+ def tell(self):
+ return self.__pos
+
+ def __copy__(self):
+ cpy = self.__class__(self.wrapped)
+ cpy.__cache = self.__cache
+ cpy.__read_complete_state = self.__read_complete_state
+ cpy.__is_closed_state = self.__is_closed_state
+ return cpy
+
+ def get_data(self):
+ pos = self.__pos
+ try:
+ self.seek(0)
+ return self.read(-1)
+ finally:
+ self.__pos = pos
+
+ def read(self, size=-1):
+ pos = self.__pos
+ end = len(self.__cache.getvalue())
+ available = end - pos
+
+ # enough data already cached?
+ if size <= available and size != -1:
+ self.__cache.seek(pos)
+ self.__pos = pos+size
+ return self.__cache.read(size)
+
+ # no, so read sufficient data from wrapped file and cache it
+ self.__cache.seek(0, 2)
+ if size == -1:
+ self.__cache.write(self.wrapped.read())
+ self.read_complete = True
+ else:
+ to_read = size - available
+ assert to_read > 0
+ data = self.wrapped.read(to_read)
+ if not data:
+ self.read_complete = True
+ else:
+ self.__cache.write(data)
+ self.__cache.seek(pos)
+
+ data = self.__cache.read(size)
+ self.__pos = self.__cache.tell()
+ assert self.__pos == pos + len(data)
+ return data
+
+ def readline(self, size=-1):
+ if not self.__have_readline:
+ raise NotImplementedError("no readline method on wrapped object")
+
+ # line we're about to read might not be complete in the cache, so
+ # read another line first
+ pos = self.__pos
+ self.__cache.seek(0, 2)
+ data = self.wrapped.readline()
+ if not data:
+ self.read_complete = True
+ else:
+ self.__cache.write(data)
+ self.__cache.seek(pos)
+
+ data = self.__cache.readline()
+ if size != -1:
+ r = data[:size]
+ self.__pos = pos+size
+ else:
+ r = data
+ self.__pos = pos+len(data)
+ return r
+
+ def readlines(self, sizehint=-1):
+ pos = self.__pos
+ self.__cache.seek(0, 2)
+ self.__cache.write(self.wrapped.read())
+ self.read_complete = True
+ self.__cache.seek(pos)
+ data = self.__cache.readlines(sizehint)
+ self.__pos = self.__cache.tell()
+ return data
+
+ def __iter__(self): return self
+ def next(self):
+ line = self.readline()
+ if line == "": raise StopIteration
+ return line
+
+ xreadlines = __iter__
+
+ def __repr__(self):
+ return ("<%s at %s whose wrapped object = %r>" %
+ (self.__class__.__name__, hex(abs(id(self))), self.wrapped))
+
+
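
A hedged sketch of what seek_wrapper provides, using a stand-in object that only supports read() (seek_wrapper itself is assumed to be imported from this module):

    from cStringIO import StringIO

    class ForwardOnly:
        # minimal file-like object: read(), but no seek()
        def __init__(self, data):
            self._fh = StringIO(data)
        def read(self, size=-1):
            return self._fh.read(size)

    fh = seek_wrapper(ForwardOnly("line one\nline two\n"))
    fh.read(8)      # reads (and caches) the first eight bytes
    fh.seek(0)      # now possible: data is replayed from the cache
    assert fh.read() == "line one\nline two\n"
    assert fh.read_complete
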
+class response_seek_wrapper(seek_wrapper):
+
+ """
+ Supports copying response objects and setting response body data.
+
+ """
+
+ def __init__(self, wrapped):
+ seek_wrapper.__init__(self, wrapped)
+ self._headers = self.wrapped.info()
+
+ def __copy__(self):
+ cpy = seek_wrapper.__copy__(self)
+ # copy headers from delegate
+ cpy._headers = copy.copy(self.info())
+ return cpy
+
+ # Note that .info() and .geturl() (the only two urllib2 response methods
+ # that are not implemented by seek_wrapper) must be here explicitly rather
+ # than by seek_wrapper's __getattr__ delegation, so that the nasty
+ # dynamically-created HTTPError classes in get_seek_wrapper_class() get the
+ # wrapped object's implementation, and not HTTPError's.
+
+ def info(self):
+ return self._headers
+
+ def geturl(self):
+ return self.wrapped.geturl()
+
+ def set_data(self, data):
+ self.seek(0)
+ self.read()
+ self.close()
+ cache = self._seek_wrapper__cache = StringIO()
+ cache.write(data)
+ self.seek(0)
+
+
+class eoffile:
+ # file-like object that always claims to be at end-of-file...
+ def read(self, size=-1): return ""
+ def readline(self, size=-1): return ""
+ def __iter__(self): return self
+ def next(self): return ""
+ def close(self): pass
+
+class eofresponse(eoffile):
+ def __init__(self, url, headers, code, msg):
+ self._url = url
+ self._headers = headers
+ self.code = code
+ self.msg = msg
+ def geturl(self): return self._url
+ def info(self): return self._headers
+
+
+class closeable_response:
+ """Avoids unnecessarily clobbering urllib.addinfourl methods on .close().
+
+ Only supports responses returned by mechanize.HTTPHandler.
+
+ After .close(), the following methods are supported:
+
+ .read()
+ .readline()
+ .info()
+ .geturl()
+ .__iter__()
+ .next()
+ .close()
+
+ and the following attributes are supported:
+
+ .code
+ .msg
+
+ Also supports pickling (but the stdlib currently does something to prevent
+ it: http://python.org/sf/1144636).
+
+ """
+ # presence of this attr indicates the response is usable after .close()
+ closeable_response = None
+
+ def __init__(self, fp, headers, url, code, msg):
+ self._set_fp(fp)
+ self._headers = headers
+ self._url = url
+ self.code = code
+ self.msg = msg
+
+ def _set_fp(self, fp):
+ self.fp = fp
+ self.read = self.fp.read
+ self.readline = self.fp.readline
+ if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
+ if hasattr(self.fp, "fileno"):
+ self.fileno = self.fp.fileno
+ else:
+ self.fileno = lambda: None
+ self.__iter__ = self.fp.__iter__
+ self.next = self.fp.next
+
+ def __repr__(self):
+ return '<%s at %s whose fp = %r>' % (
+ self.__class__.__name__, hex(abs(id(self))), self.fp)
+
+ def info(self):
+ return self._headers
+
+ def geturl(self):
+ return self._url
+
+ def close(self):
+ wrapped = self.fp
+ wrapped.close()
+ new_wrapped = eofresponse(
+ self._url, self._headers, self.code, self.msg)
+ self._set_fp(new_wrapped)
+
+ def __getstate__(self):
+ # There are three obvious options here:
+ # 1. truncate
+ # 2. read to end
+ # 3. close socket, pickle state including read position, then open
+ # again on unpickle and use Range header
+ # XXXX um, 4. refuse to pickle unless .close()d. This is better,
+ # actually ("errors should never pass silently"). Pickling doesn't
+ # work anyway ATM, because of http://python.org/sf/1144636 so fix
+ # this later
+
+ # 2 breaks pickle protocol, because one expects the original object
+ # to be left unscathed by pickling. 3 is too complicated and
+ # surprising (and too much work ;-) to happen in a sane __getstate__.
+ # So we do 1.
+
+ state = self.__dict__.copy()
+ new_wrapped = eofresponse(
+ self._url, self._headers, self.code, self.msg)
+ state["wrapped"] = new_wrapped
+ return state
+
+def test_response(data='test data', headers=[],
+ url="http://example.com/", code=200, msg="OK"):
+ return make_response(data, headers, url, code, msg)
+
+def test_html_response(data='test data', headers=[],
+ url="http://example.com/", code=200, msg="OK"):
+ headers += [("Content-type", "text/html")]
+ return make_response(data, headers, url, code, msg)
+
+def make_response(data, headers, url, code, msg):
+ """Convenient factory for objects implementing response interface.
+
+ data: string containing response body data
+ headers: sequence of (name, value) pairs
+ url: URL of response
+ code: integer response code (e.g. 200)
+ msg: string response code message (e.g. "OK")
+
+ """
+ mime_headers = make_headers(headers)
+ r = closeable_response(StringIO(data), mime_headers, url, code, msg)
+ return response_seek_wrapper(r)
+
+
+def make_headers(headers):
+ """
+ headers: sequence of (name, value) pairs
+ """
+ hdr_text = []
+ for name_value in headers:
+ hdr_text.append("%s: %s" % name_value)
+ return mimetools.Message(StringIO("\n".join(hdr_text)))
+
+
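
These factories are handy for tests and for handing a fabricated page to Browser.set_response(); a hedged sketch (the HTML and URL are invented, and mechanize.Browser is assumed to be importable):

    import mechanize

    html = '<html><body><a href="http://example.com/next">next</a></body></html>'
    fake = make_response(html, [("Content-type", "text/html")],
                         "http://example.com/fake", 200, "OK")
    br = mechanize.Browser()
    br.set_response(fake)
    print [link.url for link in br.links()]   # parsed from the fabricated HTML
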
+# Rest of this module is especially horrible, but needed, at least until we
+# fork urllib2. Even then, may want to preserve urllib2 compatibility.
+
+def get_seek_wrapper_class(response):
+ # in order to wrap response objects that are also exceptions, we must
+ # dynamically subclass the exception :-(((
+ if (isinstance(response, urllib2.HTTPError) and
+ not hasattr(response, "seek")):
+ if response.__class__.__module__ == "__builtin__":
+ exc_class_name = response.__class__.__name__
+ else:
+ exc_class_name = "%s.%s" % (
+ response.__class__.__module__, response.__class__.__name__)
+
+ class httperror_seek_wrapper(response_seek_wrapper, response.__class__):
+ # this only derives from HTTPError in order to be a subclass --
+ # the HTTPError behaviour comes from delegation
+
+ _exc_class_name = exc_class_name
+
+ def __init__(self, wrapped):
+ response_seek_wrapper.__init__(self, wrapped)
+ # be compatible with undocumented HTTPError attributes :-(
+ self.hdrs = wrapped.info()
+ self.filename = wrapped.geturl()
+
+ def __repr__(self):
+ return (
+ "<%s (%s instance) at %s "
+ "whose wrapped object = %r>" % (
+ self.__class__.__name__, self._exc_class_name,
+ hex(abs(id(self))), self.wrapped)
+ )
+ wrapper_class = httperror_seek_wrapper
+ else:
+ wrapper_class = response_seek_wrapper
+ return wrapper_class
+
+def seek_wrapped_response(response):
+ """Return a copy of response that supports seekable response interface.
+
+ Accepts responses from both mechanize and urllib2 handlers.
+
+ Copes with both ordinary response instances and HTTPError instances (which
+ can't be simply wrapped due to the requirement of preserving the exception
+ base class).
+ """
+ if not hasattr(response, "seek"):
+ wrapper_class = get_seek_wrapper_class(response)
+ response = wrapper_class(response)
+ assert hasattr(response, "get_data")
+ return response
+
+def upgrade_response(response):
+ """Return a copy of response that supports Browser response interface.
+
+ Browser response interface is that of "seekable responses"
+ (response_seek_wrapper), plus the requirement that responses must be
+ useable after .close() (closeable_response).
+
+ Accepts responses from both mechanize and urllib2 handlers.
+
+ Copes with both ordinary response instances and HTTPError instances (which
+ can't be simply wrapped due to the requirement of preserving the exception
+ base class).
+ """
+ wrapper_class = get_seek_wrapper_class(response)
+ if hasattr(response, "closeable_response"):
+ if not hasattr(response, "seek"):
+ response = wrapper_class(response)
+ assert hasattr(response, "get_data")
+ return copy.copy(response)
+
+ # a urllib2 handler constructed the response, i.e. the response is an
+ # urllib.addinfourl or a urllib2.HTTPError, instead of a
+ # _Util.closeable_response as returned by e.g. mechanize.HTTPHandler
+ try:
+ code = response.code
+ except AttributeError:
+ code = None
+ try:
+ msg = response.msg
+ except AttributeError:
+ msg = None
+
+ # may have already-.read() data from .seek() cache
+ data = None
+ get_data = getattr(response, "get_data", None)
+ if get_data:
+ data = get_data()
+
+ response = closeable_response(
+ response.fp, response.info(), response.geturl(), code, msg)
+ response = wrapper_class(response)
+ if data:
+ response.set_data(data)
+ return response
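
A hedged sketch of upgrade_response() applied to a plain urllib2 response (illustrative URL, network access assumed):

    import urllib2

    plain = urllib2.urlopen("http://example.com/")
    r = upgrade_response(plain)     # now seekable and still usable after .close()
    body = r.read()
    r.close()
    r.seek(0)
    assert r.read() == body
    print r.code, r.geturl(), r.info().get("content-type")
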
Property changes on: Zope3/branches/adamg-mechanize-update/src/mechanize/_response.py
___________________________________________________________________
Name: svn:keywords
+ Date Author Id Revision
Name: svn:eol-style
+ native
Added: Zope3/branches/adamg-mechanize-update/src/mechanize/_rfc3986.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_rfc3986.py (rev 0)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_rfc3986.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -0,0 +1,240 @@
+"""RFC 3986 URI parsing and relative reference resolution / absolutization.
+
+(aka splitting and joining)
+
+Copyright 2006 John J. Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
+included with the distribution).
+
+"""
+
+# XXX Wow, this is ugly. Overly-direct translation of the RFC ATM.
+
+import sys, re, posixpath, urllib
+
+## def chr_range(a, b):
+## return "".join(map(chr, range(ord(a), ord(b)+1)))
+
+## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+## "abcdefghijklmnopqrstuvwxyz"
+## "0123456789"
+## "-_.~")
+## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]"
+## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%'
+# this re matches any character that's not in URI_CHARS
+BAD_URI_CHARS_RE = re.compile("[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")
+
+
+def clean_url(url, encoding):
+ # percent-encode illegal URI characters
+ # Trying to come up with test cases for this gave me a headache, revisit
+ # when we do switch to unicode.
+ # Somebody else's comments (lost the attribution):
+## - IE will return you the url in the encoding you send it
+## - Mozilla/Firefox will send you latin-1 if there's no non latin-1
+## characters in your link. It will send you utf-8 however if there are...
+ if type(url) == type(""):
+ url = url.decode(encoding, "replace")
+ url = url.strip()
+ # for second param to urllib.quote(), we want URI_CHARS, minus the
+ # 'always_safe' characters that urllib.quote() never percent-encodes
+ return urllib.quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")
+
+def is_clean_uri(uri):
+ """
+ >>> is_clean_uri("ABC!")
+ True
+ >>> is_clean_uri(u"ABC!")
+ True
+ >>> is_clean_uri("ABC|")
+ False
+ >>> is_clean_uri(u"ABC|")
+ False
+ >>> is_clean_uri("http://example.com/0")
+ True
+ >>> is_clean_uri(u"http://example.com/0")
+ True
+ """
+ # note module re treats bytestrings as though they were decoded as latin-1
+ # so this function accepts both unicode and bytestrings
+ return not bool(BAD_URI_CHARS_RE.search(uri))
+
+
+SPLIT_MATCH = re.compile(
+ r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match
+def urlsplit(absolute_uri):
+ """Return scheme, authority, path, query, fragment."""
+ match = SPLIT_MATCH(absolute_uri)
+ if match:
+ g = match.groups()
+ return g[1], g[3], g[4], g[6], g[8]
+
+def urlunsplit(parts):
+ scheme, authority, path, query, fragment = parts
+ r = []
+ append = r.append
+ if scheme is not None:
+ append(scheme)
+ append(":")
+ if authority is not None:
+ append("//")
+ append(authority)
+ append(path)
+ if query is not None:
+ append("?")
+ append(query)
+ if fragment is not None:
+ append("#")
+ append(fragment)
+ return "".join(r)
+
+def urljoin(base_uri, uri_reference):
+ return urlunsplit(urljoin_parts(urlsplit(base_uri),
+ urlsplit(uri_reference)))
+
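
For orientation, the splitting and joining helpers above behave roughly like this (None marks an absent component):

    >>> urlsplit("http://example.com/a/b?q=1#frag")
    ('http', 'example.com', '/a/b', 'q=1', 'frag')
    >>> urlsplit("/a/b")
    (None, None, '/a/b', None, None)
    >>> urljoin("http://example.com/a/b", "../c")
    'http://example.com/c'
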
+# oops, this doesn't do the same thing as the literal translation
+# from the RFC below
+## def urljoin_parts(base_parts, reference_parts):
+## scheme, authority, path, query, fragment = base_parts
+## rscheme, rauthority, rpath, rquery, rfragment = reference_parts
+
+## # compute target URI path
+## if rpath == "":
+## tpath = path
+## else:
+## tpath = rpath
+## if not tpath.startswith("/"):
+## tpath = merge(authority, path, tpath)
+## tpath = posixpath.normpath(tpath)
+
+## if rscheme is not None:
+## return (rscheme, rauthority, tpath, rquery, rfragment)
+## elif rauthority is not None:
+## return (scheme, rauthority, tpath, rquery, rfragment)
+## elif rpath == "":
+## if rquery is not None:
+## tquery = rquery
+## else:
+## tquery = query
+## return (scheme, authority, tpath, tquery, rfragment)
+## else:
+## return (scheme, authority, tpath, rquery, rfragment)
+
+def urljoin_parts(base_parts, reference_parts):
+ scheme, authority, path, query, fragment = base_parts
+ rscheme, rauthority, rpath, rquery, rfragment = reference_parts
+
+ if rscheme == scheme:
+ rscheme = None
+
+ if rscheme is not None:
+ tscheme, tauthority, tpath, tquery = (
+ rscheme, rauthority, remove_dot_segments(rpath), rquery)
+ else:
+ if rauthority is not None:
+ tauthority, tpath, tquery = (
+ rauthority, remove_dot_segments(rpath), rquery)
+ else:
+ if rpath == "":
+ tpath = path
+ if rquery is not None:
+ tquery = rquery
+ else:
+ tquery = query
+ else:
+ if rpath.startswith("/"):
+ tpath = remove_dot_segments(rpath)
+ else:
+ tpath = merge(authority, path, rpath)
+ tpath = remove_dot_segments(tpath)
+ tquery = rquery
+ tauthority = authority
+ tscheme = scheme
+ tfragment = rfragment
+ return (tscheme, tauthority, tpath, tquery, tfragment)
+
+# um, something *vaguely* like this is what I want, but I have to generate
+# lots of test cases first, if only to understand what it is that
+# remove_dot_segments really does...
+## def remove_dot_segments(path):
+## if path == '':
+## return ''
+## comps = path.split('/')
+## new_comps = []
+## for comp in comps:
+## if comp in ['.', '']:
+## if not new_comps or new_comps[-1]:
+## new_comps.append('')
+## continue
+## if comp != '..':
+## new_comps.append(comp)
+## elif new_comps:
+## new_comps.pop()
+## return '/'.join(new_comps)
+
+
+def remove_dot_segments(path):
+ r = []
+ while path:
+ # A
+ if path.startswith("../"):
+ path = path[3:]
+ continue
+ if path.startswith("./"):
+ path = path[2:]
+ continue
+ # B
+ if path.startswith("/./"):
+ path = path[2:]
+ continue
+ if path == "/.":
+ path = "/"
+ continue
+ # C
+ if path.startswith("/../"):
+ path = path[3:]
+ if r:
+ r.pop()
+ continue
+ if path == "/..":
+ path = "/"
+ if r:
+ r.pop()
+ continue
+ # D
+ if path == ".":
+ path = path[1:]
+ continue
+ if path == "..":
+ path = path[2:]
+ continue
+ # E
+ start = 0
+ if path.startswith("/"):
+ start = 1
+ ii = path.find("/", start)
+ if ii < 0:
+ ii = None
+ r.append(path[:ii])
+ if ii is None:
+ break
+ path = path[ii:]
+ return "".join(r)
+
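
The examples from RFC 3986 section 5.2.4, as doctest-style checks of remove_dot_segments() above:

    >>> remove_dot_segments("/a/b/c/./../../g")
    '/a/g'
    >>> remove_dot_segments("mid/content=5/../6")
    'mid/6'
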
+def merge(base_authority, base_path, ref_path):
+ # XXXX Oddly, the sample Perl implementation of this by Roy Fielding
+ # doesn't even take base_authority as a parameter, despite the wording in
+ # the RFC suggesting otherwise. Perhaps I'm missing some obvious identity.
+ #if base_authority is not None and base_path == "":
+ if base_path == "":
+ return "/" + ref_path
+ ii = base_path.rfind("/")
+ if ii >= 0:
+ return base_path[:ii+1] + ref_path
+ return ref_path
+
+if __name__ == "__main__":
+ import doctest
+ doctest.testmod()
Property changes on: Zope3/branches/adamg-mechanize-update/src/mechanize/_rfc3986.py
___________________________________________________________________
Name: svn:keywords
+ Date Author Id Revision
Name: svn:eol-style
+ native
Added: Zope3/branches/adamg-mechanize-update/src/mechanize/_seek.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_seek.py (rev 0)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_seek.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -0,0 +1,16 @@
+from urllib2 import BaseHandler
+from _util import deprecation
+from _response import response_seek_wrapper
+
+
+class SeekableProcessor(BaseHandler):
+ """Deprecated: Make responses seekable."""
+
+ def __init__(self):
+ deprecation(
+ "See http://wwwsearch.sourceforge.net/mechanize/doc.html#seekable")
+
+ def any_response(self, request, response):
+ if not hasattr(response, "seek"):
+ return response_seek_wrapper(response)
+ return response
Property changes on: Zope3/branches/adamg-mechanize-update/src/mechanize/_seek.py
___________________________________________________________________
Name: svn:keywords
+ Date Author Id Revision
Name: svn:eol-style
+ native
Added: Zope3/branches/adamg-mechanize-update/src/mechanize/_upgrade.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_upgrade.py (rev 0)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_upgrade.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -0,0 +1,40 @@
+from urllib2 import BaseHandler
+
+from _request import Request
+from _response import upgrade_response
+from _util import deprecation
+
+
+class HTTPRequestUpgradeProcessor(BaseHandler):
+ # upgrade urllib2.Request to this module's Request
+ # yuck!
+ handler_order = 0 # before anything else
+
+ def http_request(self, request):
+ if not hasattr(request, "add_unredirected_header"):
+ newrequest = Request(request._Request__original, request.data,
+ request.headers)
+ try: newrequest.origin_req_host = request.origin_req_host
+ except AttributeError: pass
+ try: newrequest.unverifiable = request.unverifiable
+ except AttributeError: pass
+ try: newrequest.visit = request.visit
+ except AttributeError: pass
+ request = newrequest
+ return request
+
+ https_request = http_request
+
+
+class ResponseUpgradeProcessor(BaseHandler):
+ # upgrade responses to be .close()able without becoming unusable
+ handler_order = 0 # before anything else
+
+ def __init__(self):
+ deprecation(
+ "See http://wwwsearch.sourceforge.net/mechanize/doc.html#seekable")
+
+ def any_response(self, request, response):
+ if not hasattr(response, 'closeable_response'):
+ response = upgrade_response(response)
+ return response
Property changes on: Zope3/branches/adamg-mechanize-update/src/mechanize/_upgrade.py
___________________________________________________________________
Name: svn:keywords
+ Date Author Id Revision
Name: svn:eol-style
+ native
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_urllib2.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_urllib2.py 2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_urllib2.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -3,51 +3,60 @@
from urllib2 import \
URLError, \
HTTPError, \
- GopherError, \
+ GopherError
+# ...and from mechanize
+from _opener import OpenerDirector, \
+ SeekableResponseOpener, \
+ build_opener, install_opener, urlopen
+from _auth import \
HTTPPasswordMgr, \
HTTPPasswordMgrWithDefaultRealm, \
AbstractBasicAuthHandler, \
- AbstractDigestAuthHandler
-# ...and from mechanize
-from _opener import OpenerDirector
-from _auth import \
+ AbstractDigestAuthHandler, \
HTTPProxyPasswordMgr, \
ProxyHandler, \
ProxyBasicAuthHandler, \
ProxyDigestAuthHandler, \
HTTPBasicAuthHandler, \
- HTTPDigestAuthHandler
-from _urllib2_support import \
- Request, \
- build_opener, install_opener, urlopen, \
- OpenerFactory, urlretrieve, \
+ HTTPDigestAuthHandler, \
+ HTTPSClientCertMgr
+from _request import \
+ Request
+from _http import \
RobotExclusionError
# handlers...
# ...from urllib2...
from urllib2 import \
BaseHandler, \
- HTTPDefaultErrorHandler, \
UnknownHandler, \
FTPHandler, \
CacheFTPHandler, \
FileHandler, \
GopherHandler
# ...and from mechanize
-from _urllib2_support import \
+from _http import \
HTTPHandler, \
+ HTTPDefaultErrorHandler, \
HTTPRedirectHandler, \
- HTTPRequestUpgradeProcessor, \
HTTPEquivProcessor, \
- SeekableProcessor, \
HTTPCookieProcessor, \
HTTPRefererProcessor, \
HTTPRefreshProcessor, \
HTTPErrorProcessor, \
+ HTTPRobotRulesProcessor
+from _upgrade import \
+ HTTPRequestUpgradeProcessor, \
+ ResponseUpgradeProcessor
+from _debug import \
HTTPResponseDebugProcessor, \
- HTTPRedirectDebugProcessor, \
- HTTPRobotRulesProcessor
+ HTTPRedirectDebugProcessor
+from _seek import \
+ SeekableProcessor
+# crap ATM
+## from _gzip import \
+## HTTPGzipProcessor
import httplib
if hasattr(httplib, 'HTTPS'):
- from _urllib2_support import HTTPSHandler
+ from _http import HTTPSHandler
del httplib
Deleted: Zope3/branches/adamg-mechanize-update/src/mechanize/_urllib2_support.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_urllib2_support.py 2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_urllib2_support.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -1,718 +0,0 @@
-"""Integration with Python standard library module urllib2.
-
-Also includes a redirection bugfix, support for parsing HTML HEAD blocks for
-the META HTTP-EQUIV tag contents, and following Refresh header redirects.
-
-Copyright 2002-2006 John J Lee <jjl at pobox.com>
-
-This code is free software; you can redistribute it and/or modify it
-under the terms of the BSD or ZPL 2.1 licenses (see the file
-COPYING.txt included with the distribution).
-
-"""
-
-import copy, time, tempfile, htmlentitydefs, re, logging, types, \
- string, socket, urlparse, urllib2, urllib, httplib, sgmllib
-from urllib2 import URLError, HTTPError, BaseHandler
-from cStringIO import StringIO
-try:
- import threading as _threading
-except ImportError:
- import dummy_threading as _threading
-
-import _opener
-from _request import Request
-from _util import isstringlike, startswith, \
- getheaders, closeable_response, response_seek_wrapper
-from _html import unescape, unescape_charref
-from _headersutil import is_html
-from _clientcookie import CookieJar, request_host
-
-debug = logging.getLogger("mechanize.cookies").debug
-
-
-CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes
-DEFAULT_ENCODING = 'latin-1'
-
-
-# This fixes a bug in urllib2 as of Python 2.1.3 and 2.2.2
-# (http://www.python.org/sf/549151)
-# 2.2.3 is broken here (my fault!), 2.3 is fixed.
-class HTTPRedirectHandler(BaseHandler):
- # maximum number of redirections to any single URL
- # this is needed because of the state that cookies introduce
- max_repeats = 4
- # maximum total number of redirections (regardless of URL) before
- # assuming we're in a loop
- max_redirections = 10
-
- # Implementation notes:
-
- # To avoid the server sending us into an infinite loop, the request
- # object needs to track what URLs we have already seen. Do this by
- # adding a handler-specific attribute to the Request object. The value
- # of the dict is used to count the number of times the same URL has
- # been visited. This is needed because visiting the same URL twice
- # does not necessarily imply a loop, thanks to state introduced by
- # cookies.
-
- # Always unhandled redirection codes:
- # 300 Multiple Choices: should not handle this here.
- # 304 Not Modified: no need to handle here: only of interest to caches
- # that do conditional GETs
- # 305 Use Proxy: probably not worth dealing with here
- # 306 Unused: what was this for in the previous versions of protocol??
-
- def redirect_request(self, newurl, req, fp, code, msg, headers):
- """Return a Request or None in response to a redirect.
-
- This is called by the http_error_30x methods when a redirection
- response is received. If a redirection should take place, return a
- new Request to allow http_error_30x to perform the redirect;
- otherwise, return None to indicate that an HTTPError should be
- raised.
-
- """
- if code in (301, 302, 303, "refresh") or \
- (code == 307 and not req.has_data()):
- # Strictly (according to RFC 2616), 301 or 302 in response to
- # a POST MUST NOT cause a redirection without confirmation
- # from the user (of urllib2, in this case). In practice,
- # essentially all clients do redirect in this case, so we do
- # the same.
- return Request(newurl,
- headers=req.headers,
- origin_req_host=req.get_origin_req_host(),
- unverifiable=True)
- else:
- raise HTTPError(req.get_full_url(), code, msg, headers, fp)
-
- def http_error_302(self, req, fp, code, msg, headers):
- # Some servers (incorrectly) return multiple Location headers
- # (so probably same goes for URI). Use first header.
- if headers.has_key('location'):
- newurl = getheaders(headers, 'location')[0]
- elif headers.has_key('uri'):
- newurl = getheaders(headers, 'uri')[0]
- else:
- return
- newurl = urlparse.urljoin(req.get_full_url(), newurl)
-
- # XXX Probably want to forget about the state of the current
- # request, although that might interact poorly with other
- # handlers that also use handler-specific request attributes
- new = self.redirect_request(newurl, req, fp, code, msg, headers)
- if new is None:
- return
-
- # loop detection
- # .redirect_dict has a key url if url was previously visited.
- if hasattr(req, 'redirect_dict'):
- visited = new.redirect_dict = req.redirect_dict
- if (visited.get(newurl, 0) >= self.max_repeats or
- len(visited) >= self.max_redirections):
- raise HTTPError(req.get_full_url(), code,
- self.inf_msg + msg, headers, fp)
- else:
- visited = new.redirect_dict = req.redirect_dict = {}
- visited[newurl] = visited.get(newurl, 0) + 1
-
- # Don't close the fp until we are sure that we won't use it
- # with HTTPError.
- fp.read()
- fp.close()
-
- return self.parent.open(new)
-
- http_error_301 = http_error_303 = http_error_307 = http_error_302
- http_error_refresh = http_error_302
-
- inf_msg = "The HTTP server returned a redirect error that would " \
- "lead to an infinite loop.\n" \
- "The last 30x error message was:\n"
-
-
-class HTTPRequestUpgradeProcessor(BaseHandler):
- # upgrade urllib2.Request to this module's Request
- # yuck!
- handler_order = 0 # before anything else
-
- def http_request(self, request):
- if not hasattr(request, "add_unredirected_header"):
- newrequest = Request(request._Request__original, request.data,
- request.headers)
- try: newrequest.origin_req_host = request.origin_req_host
- except AttributeError: pass
- try: newrequest.unverifiable = request.unverifiable
- except AttributeError: pass
- request = newrequest
- return request
-
- https_request = http_request
-
-# XXX would self.reset() work, instead of raising this exception?
-class EndOfHeadError(Exception): pass
-class AbstractHeadParser:
- # only these elements are allowed in or before HEAD of document
- head_elems = ("html", "head",
- "title", "base",
- "script", "style", "meta", "link", "object")
- _entitydefs = htmlentitydefs.name2codepoint
- _encoding = DEFAULT_ENCODING
-
- def __init__(self):
- self.http_equiv = []
-
- def start_meta(self, attrs):
- http_equiv = content = None
- for key, value in attrs:
- if key == "http-equiv":
- http_equiv = self.unescape_attr_if_required(value)
- elif key == "content":
- content = self.unescape_attr_if_required(value)
- if http_equiv is not None:
- self.http_equiv.append((http_equiv, content))
-
- def end_head(self):
- raise EndOfHeadError()
-
- def handle_entityref(self, name):
- #debug("%s", name)
- self.handle_data(unescape(
- '&%s;' % name, self._entitydefs, self._encoding))
-
- def handle_charref(self, name):
- #debug("%s", name)
- self.handle_data(unescape_charref(name, self._encoding))
-
- def unescape_attr(self, name):
- #debug("%s", name)
- return unescape(name, self._entitydefs, self._encoding)
-
- def unescape_attrs(self, attrs):
- #debug("%s", attrs)
- escaped_attrs = {}
- for key, val in attrs.items():
- escaped_attrs[key] = self.unescape_attr(val)
- return escaped_attrs
-
- def unknown_entityref(self, ref):
- self.handle_data("&%s;" % ref)
-
- def unknown_charref(self, ref):
- self.handle_data("&#%s;" % ref)
-
-
-try:
- import HTMLParser
-except ImportError:
- pass
-else:
- class XHTMLCompatibleHeadParser(AbstractHeadParser,
- HTMLParser.HTMLParser):
- def __init__(self):
- HTMLParser.HTMLParser.__init__(self)
- AbstractHeadParser.__init__(self)
-
- def handle_starttag(self, tag, attrs):
- if tag not in self.head_elems:
- raise EndOfHeadError()
- try:
- method = getattr(self, 'start_' + tag)
- except AttributeError:
- try:
- method = getattr(self, 'do_' + tag)
- except AttributeError:
- pass # unknown tag
- else:
- method(attrs)
- else:
- method(attrs)
-
- def handle_endtag(self, tag):
- if tag not in self.head_elems:
- raise EndOfHeadError()
- try:
- method = getattr(self, 'end_' + tag)
- except AttributeError:
- pass # unknown tag
- else:
- method()
-
- def unescape(self, name):
- # Use the entitydefs passed into constructor, not
- # HTMLParser.HTMLParser's entitydefs.
- return self.unescape_attr(name)
-
- def unescape_attr_if_required(self, name):
- return name # HTMLParser.HTMLParser already did it
-
-class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):
-
- def _not_called(self):
- assert False
-
- def __init__(self):
- sgmllib.SGMLParser.__init__(self)
- AbstractHeadParser.__init__(self)
-
- def handle_starttag(self, tag, method, attrs):
- if tag not in self.head_elems:
- raise EndOfHeadError()
- if tag == "meta":
- method(attrs)
-
- def unknown_starttag(self, tag, attrs):
- self.handle_starttag(tag, self._not_called, attrs)
-
- def handle_endtag(self, tag, method):
- if tag in self.head_elems:
- method()
- else:
- raise EndOfHeadError()
-
- def unescape_attr_if_required(self, name):
- return self.unescape_attr(name)
-
-def parse_head(fileobj, parser):
- """Return a list of key, value pairs."""
- while 1:
- data = fileobj.read(CHUNK)
- try:
- parser.feed(data)
- except EndOfHeadError:
- break
- if len(data) != CHUNK:
- # this should only happen if there is no HTML body, or if
- # CHUNK is big
- break
- return parser.http_equiv
-
-class HTTPEquivProcessor(BaseHandler):
- """Append META HTTP-EQUIV headers to regular HTTP headers."""
-
- handler_order = 300 # before handlers that look at HTTP headers
-
- def __init__(self, head_parser_class=HeadParser,
- i_want_broken_xhtml_support=False,
- ):
- self.head_parser_class = head_parser_class
- self._allow_xhtml = i_want_broken_xhtml_support
-
- def http_response(self, request, response):
- if not hasattr(response, "seek"):
- response = response_seek_wrapper(response)
- headers = response.info()
- url = response.geturl()
- ct_hdrs = getheaders(response.info(), "content-type")
- if is_html(ct_hdrs, url, self._allow_xhtml):
- try:
- try:
- html_headers = parse_head(response, self.head_parser_class())
- finally:
- response.seek(0)
- except (HTMLParser.HTMLParseError,
- sgmllib.SGMLParseError):
- pass
- else:
- for hdr, val in html_headers:
- # rfc822.Message interprets this as appending, not clobbering
- headers[hdr] = val
- return response
-
- https_response = http_response
-
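As a rough illustration of what the head parsing above produces, here is a sketch using the HeadParser class exported by the package (the HTML string is made up; the broad except is only there because the parser signals end-of-head with an internal exception):

    from mechanize import HeadParser

    html = ('<html><head>'
            '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">'
            '</head><body>hello</body></html>')
    parser = HeadParser()
    try:
        parser.feed(html)
    except Exception:
        pass  # parsing stops as soon as the parser leaves the HEAD section
    print parser.http_equiv  # [('Content-Type', 'text/html; charset=UTF-8')]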
-class SeekableProcessor(BaseHandler):
- """Make responses seekable."""
-
- def any_response(self, request, response):
- if not hasattr(response, "seek"):
- return response_seek_wrapper(response)
- return response
-
-class HTTPCookieProcessor(BaseHandler):
- """Handle HTTP cookies.
-
- Public attributes:
-
- cookiejar: CookieJar instance
-
- """
- def __init__(self, cookiejar=None):
- if cookiejar is None:
- cookiejar = CookieJar()
- self.cookiejar = cookiejar
-
- def http_request(self, request):
- self.cookiejar.add_cookie_header(request)
- return request
-
- def http_response(self, request, response):
- self.cookiejar.extract_cookies(response, request)
- return response
-
- https_request = http_request
- https_response = http_response
-
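A minimal sketch of passing an explicit CookieJar through this processor, assuming mechanize.CookieJar, mechanize.HTTPCookieProcessor and mechanize.build_opener behave like their urllib2/cookielib counterparts (the URL is a placeholder and a network connection is needed):

    import mechanize

    cj = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    opener.open("http://example.com/")
    for cookie in cj:
        print cookie.name, cookie.value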
-try:
- import robotparser
-except ImportError:
- pass
-else:
- class RobotExclusionError(urllib2.HTTPError):
- def __init__(self, request, *args):
- apply(urllib2.HTTPError.__init__, (self,)+args)
- self.request = request
-
- class HTTPRobotRulesProcessor(BaseHandler):
- # before redirections, after everything else
- handler_order = 800
-
- try:
- from httplib import HTTPMessage
- except:
- from mimetools import Message
- http_response_class = Message
- else:
- http_response_class = HTTPMessage
-
- def __init__(self, rfp_class=robotparser.RobotFileParser):
- self.rfp_class = rfp_class
- self.rfp = None
- self._host = None
-
- def http_request(self, request):
- host = request.get_host()
- scheme = request.get_type()
- if host != self._host:
- self.rfp = self.rfp_class()
- self.rfp.set_url(scheme+"://"+host+"/robots.txt")
- self.rfp.read()
- self._host = host
-
- ua = request.get_header("User-agent", "")
- if self.rfp.can_fetch(ua, request.get_full_url()):
- return request
- else:
- msg = "request disallowed by robots.txt"
- raise RobotExclusionError(
- request,
- request.get_full_url(),
- 403, msg,
- self.http_response_class(StringIO()), StringIO(msg))
-
- https_request = http_request
-
-class HTTPRefererProcessor(BaseHandler):
- """Add Referer header to requests.
-
-    This only makes sense if you use each RefererProcessor for a single
-    chain of requests (so, for example, if you use a single
-    HTTPRefererProcessor to fetch a series of URLs extracted from a single
-    page, this will break).
-
- There's a proper implementation of this in module mechanize.
-
- """
- def __init__(self):
- self.referer = None
-
- def http_request(self, request):
- if ((self.referer is not None) and
- not request.has_header("Referer")):
- request.add_unredirected_header("Referer", self.referer)
- return request
-
- def http_response(self, request, response):
- self.referer = response.geturl()
- return response
-
- https_request = http_request
- https_response = http_response
-
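A sketch of the limitation the docstring warns about, assuming the processor is installed via mechanize.build_opener (URLs are placeholders): the stored referer always tracks the most recent response, so links harvested from an earlier page are fetched with the wrong Referer header.

    import mechanize

    opener = mechanize.build_opener(mechanize.HTTPRefererProcessor())
    opener.open("http://example.com/page1")  # referer becomes .../page1
    opener.open("http://example.com/page2")  # referer becomes .../page2
    # a link found on page1 would now be fetched with
    # Referer: http://example.com/page2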
-class HTTPResponseDebugProcessor(BaseHandler):
- handler_order = 900 # before redirections, after everything else
-
- def http_response(self, request, response):
- if not hasattr(response, "seek"):
- response = response_seek_wrapper(response)
- info = getLogger("mechanize.http_responses").info
- try:
- info(response.read())
- finally:
- response.seek(0)
- info("*****************************************************")
- return response
-
- https_response = http_response
-
-class HTTPRedirectDebugProcessor(BaseHandler):
- def http_request(self, request):
- if hasattr(request, "redirect_dict"):
- info = getLogger("mechanize.http_redirects").info
- info("redirecting to %s", request.get_full_url())
- return request
-
-class HTTPRefreshProcessor(BaseHandler):
- """Perform HTTP Refresh redirections.
-
- Note that if a non-200 HTTP code has occurred (for example, a 30x
- redirect), this processor will do nothing.
-
- By default, only zero-time Refresh headers are redirected. Use the
- max_time attribute / constructor argument to allow Refresh with longer
- pauses. Use the honor_time attribute / constructor argument to control
- whether the requested pause is honoured (with a time.sleep()) or
- skipped in favour of immediate redirection.
-
- Public attributes:
-
- max_time: see above
- honor_time: see above
-
- """
- handler_order = 1000
-
- def __init__(self, max_time=0, honor_time=True):
- self.max_time = max_time
- self.honor_time = honor_time
-
- def http_response(self, request, response):
- code, msg, hdrs = response.code, response.msg, response.info()
-
- if code == 200 and hdrs.has_key("refresh"):
- refresh = getheaders(hdrs, "refresh")[0]
- ii = string.find(refresh, ";")
- if ii != -1:
- pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:]
- jj = string.find(newurl_spec, "=")
- if jj != -1:
- key, newurl = newurl_spec[:jj], newurl_spec[jj+1:]
- if key.strip().lower() != "url":
- debug("bad Refresh header: %r" % refresh)
- return response
- else:
- pause, newurl = float(refresh), response.geturl()
- if (self.max_time is None) or (pause <= self.max_time):
- if pause > 1E-3 and self.honor_time:
- time.sleep(pause)
- hdrs["location"] = newurl
- # hardcoded http is NOT a bug
- response = self.parent.error(
- "http", request, response,
- "refresh", msg, hdrs)
-
- return response
-
- https_response = http_response
-
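The Refresh handling above boils down to splitting the header value into a pause and a URL; a standalone sketch of that parsing (the header value is made up):

    refresh = "5; url=http://example.com/next"
    ii = refresh.find(";")
    if ii != -1:
        pause, spec = float(refresh[:ii]), refresh[ii + 1:]
        key, newurl = spec.split("=", 1)
        assert key.strip().lower() == "url"
    else:
        pause, newurl = float(refresh), None  # no URL: refresh same page
    print pause, newurl  # 5.0 http://example.com/next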
-class HTTPErrorProcessor(BaseHandler):
- """Process HTTP error responses.
-
-    The purpose of this handler is to allow other response processors a
-    look-in by removing the call to parent.error() from
- AbstractHTTPHandler.
-
- For non-200 error codes, this just passes the job on to the
- Handler.<proto>_error_<code> methods, via the OpenerDirector.error
- method. Eventually, urllib2.HTTPDefaultErrorHandler will raise an
- HTTPError if no other handler handles the error.
-
- """
- handler_order = 1000 # after all other processors
-
- def http_response(self, request, response):
- code, msg, hdrs = response.code, response.msg, response.info()
-
- if code != 200:
- # hardcoded http is NOT a bug
- response = self.parent.error(
- "http", request, response, code, msg, hdrs)
-
- return response
-
- https_response = http_response
-
-
-class AbstractHTTPHandler(BaseHandler):
-
- def __init__(self, debuglevel=0):
- self._debuglevel = debuglevel
-
- def set_http_debuglevel(self, level):
- self._debuglevel = level
-
- def do_request_(self, request):
- host = request.get_host()
- if not host:
- raise URLError('no host given')
-
- if request.has_data(): # POST
- data = request.get_data()
- if not request.has_header('Content-type'):
- request.add_unredirected_header(
- 'Content-type',
- 'application/x-www-form-urlencoded')
-
- scheme, sel = urllib.splittype(request.get_selector())
- sel_host, sel_path = urllib.splithost(sel)
- if not request.has_header('Host'):
- request.add_unredirected_header('Host', sel_host or host)
- for name, value in self.parent.addheaders:
- name = string.capitalize(name)
- if not request.has_header(name):
- request.add_unredirected_header(name, value)
-
- return request
-
- def do_open(self, http_class, req):
- """Return an addinfourl object for the request, using http_class.
-
- http_class must implement the HTTPConnection API from httplib.
- The addinfourl return value is a file-like object. It also
- has methods and attributes including:
- - info(): return a mimetools.Message object for the headers
- - geturl(): return the original request URL
- - code: HTTP status code
- """
- host = req.get_host()
- if not host:
- raise URLError('no host given')
-
- h = http_class(host) # will parse host:port
- h.set_debuglevel(self._debuglevel)
-
- headers = req.headers.copy()
- headers.update(req.unredirected_hdrs)
- # We want to make an HTTP/1.1 request, but the addinfourl
- # class isn't prepared to deal with a persistent connection.
- # It will try to read all remaining data from the socket,
- # which will block while the server waits for the next request.
- # So make sure the connection gets closed after the (only)
- # request.
- headers["Connection"] = "close"
- try:
- h.request(req.get_method(), req.get_selector(), req.data, headers)
- r = h.getresponse()
- except socket.error, err: # XXX what error?
- raise URLError(err)
-
- # Pick apart the HTTPResponse object to get the addinfourl
- # object initialized properly.
-
- # Wrap the HTTPResponse object in socket's file object adapter
- # for Windows. That adapter calls recv(), so delegate recv()
- # to read(). This weird wrapping allows the returned object to
- # have readline() and readlines() methods.
-
- # XXX It might be better to extract the read buffering code
- # out of socket._fileobject() and into a base class.
-
- r.recv = r.read
- fp = socket._fileobject(r, 'rb', -1)
-
- resp = closeable_response(fp, r.msg, req.get_full_url(),
- r.status, r.reason)
- return resp
-
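A sketch of the response interface the docstring above describes, using the module-level urlopen() defined further down (the URL is a placeholder; a network connection is assumed):

    import mechanize

    response = mechanize.urlopen("http://example.com/")
    print response.code                               # e.g. 200
    print response.geturl()                           # URL actually fetched
    print response.info().getheader("Content-Type")   # headers as a Message
    body = response.read()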
-
-class HTTPHandler(AbstractHTTPHandler):
- def http_open(self, req):
- return self.do_open(httplib.HTTPConnection, req)
-
- http_request = AbstractHTTPHandler.do_request_
-
-if hasattr(httplib, 'HTTPS'):
- class HTTPSHandler(AbstractHTTPHandler):
- def https_open(self, req):
- return self.do_open(httplib.HTTPSConnection, req)
-
- https_request = AbstractHTTPHandler.do_request_
-
-class OpenerFactory:
- """This class's interface is quite likely to change."""
-
- default_classes = [
- # handlers
- urllib2.ProxyHandler,
- urllib2.UnknownHandler,
- HTTPHandler, # from this module (derived from new AbstractHTTPHandler)
- urllib2.HTTPDefaultErrorHandler,
- HTTPRedirectHandler, # from this module (bugfixed)
- urllib2.FTPHandler,
- urllib2.FileHandler,
- # processors
- HTTPRequestUpgradeProcessor,
- HTTPCookieProcessor,
- HTTPErrorProcessor
- ]
- handlers = []
- replacement_handlers = []
-
- def __init__(self, klass=_opener.OpenerDirector):
- self.klass = klass
-
- def build_opener(self, *handlers):
- """Create an opener object from a list of handlers and processors.
-
- The opener will use several default handlers and processors, including
- support for HTTP and FTP.
-
- If any of the handlers passed as arguments are subclasses of the
- default handlers, the default handlers will not be used.
-
- """
- opener = self.klass()
- default_classes = list(self.default_classes)
- if hasattr(httplib, 'HTTPS'):
- default_classes.append(HTTPSHandler)
- skip = []
- for klass in default_classes:
- for check in handlers:
- if type(check) == types.ClassType:
- if issubclass(check, klass):
- skip.append(klass)
- elif type(check) == types.InstanceType:
- if isinstance(check, klass):
- skip.append(klass)
- for klass in skip:
- default_classes.remove(klass)
-
- for klass in default_classes:
- opener.add_handler(klass())
- for h in handlers:
- if type(h) == types.ClassType:
- h = h()
- opener.add_handler(h)
-
- return opener
-
-build_opener = OpenerFactory().build_opener
-
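A sketch of the "subclasses replace defaults" rule described in build_opener() above; VerboseHTTPHandler is a made-up example handler and the URL is a placeholder:

    import mechanize

    class VerboseHTTPHandler(mechanize.HTTPHandler):
        def http_open(self, req):
            print "opening", req.get_full_url()
            return mechanize.HTTPHandler.http_open(self, req)

    # because VerboseHTTPHandler subclasses the default HTTPHandler,
    # it is used instead of the default one
    opener = mechanize.build_opener(VerboseHTTPHandler)
    response = opener.open("http://example.com/")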
-_opener = None
-urlopen_lock = _threading.Lock()
-def urlopen(url, data=None):
- global _opener
- if _opener is None:
- urlopen_lock.acquire()
- try:
- if _opener is None:
- _opener = build_opener()
- finally:
- urlopen_lock.release()
- return _opener.open(url, data)
-
-def urlretrieve(url, filename=None, reporthook=None, data=None):
- global _opener
- if _opener is None:
- urlopen_lock.acquire()
- try:
- if _opener is None:
- _opener = build_opener()
- finally:
- urlopen_lock.release()
- return _opener.retrieve(url, filename, reporthook, data)
-
-def install_opener(opener):
- global _opener
- _opener = opener
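Putting the error processing together, a sketch of how a non-200 response typically surfaces to callers of urlopen(), assuming the server actually returns an error page (the URL is a placeholder):

    import mechanize
    from mechanize import HTTPError

    try:
        response = mechanize.urlopen("http://example.com/no-such-page")
    except HTTPError, exc:
        # the exception doubles as a response-like object
        print exc.code
        body = exc.read()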
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_useragent.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_useragent.py 2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_useragent.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -13,40 +13,31 @@
import sys, warnings, urllib2
-from _opener import OpenerDirector
-
+import _opener
import _urllib2
import _auth
import _gzip
+import _response
-class HTTPRefererProcessor(_urllib2.BaseHandler):
- def http_request(self, request):
- # See RFC 2616 14.36. The only times we know the source of the
- # request URI has a URI associated with it are redirect, and
- # Browser.click() / Browser.submit() / Browser.follow_link().
- # Otherwise, it's the user's job to add any Referer header before
- # .open()ing.
- if hasattr(request, "redirect_dict"):
- request = self.parent._add_referer_header(
- request, origin_request=False)
- return request
-
- https_request = http_request
-
-
-class UserAgent(OpenerDirector):
+class UserAgentBase(_opener.OpenerDirector):
"""Convenient user-agent class.
Do not use .add_handler() to add a handler for something already dealt with
by this code.
+ The only reason at present for the distinction between UserAgent and
+ UserAgentBase is so that classes that depend on .seek()able responses
+ (e.g. mechanize.Browser) can inherit from UserAgentBase. The subclass
+ UserAgent exposes a .set_seekable_responses() method that allows switching
+ off the adding of a .seek() method to responses.
+
Public attributes:
addheaders: list of (name, value) pairs specifying headers to send with
every request, unless they are overridden in the Request instance.
- >>> ua = UserAgent()
+ >>> ua = UserAgentBase()
>>> ua.addheaders = [
... ("User-agent", "Mozilla/5.0 (compatible)"),
... ("From", "responsible.person at example.com")]
@@ -74,9 +65,7 @@
"_redirect": _urllib2.HTTPRedirectHandler,
"_cookies": _urllib2.HTTPCookieProcessor,
"_refresh": _urllib2.HTTPRefreshProcessor,
- "_referer": HTTPRefererProcessor, # from this module, note
"_equiv": _urllib2.HTTPEquivProcessor,
- "_seek": _urllib2.SeekableProcessor,
"_proxy": _urllib2.ProxyHandler,
"_proxy_basicauth": _urllib2.ProxyBasicAuthHandler,
"_proxy_digestauth": _urllib2.ProxyDigestAuthHandler,
@@ -92,18 +81,18 @@
default_others = ["_unknown", "_http_error", "_http_request_upgrade",
"_http_default_error",
]
- default_features = ["_redirect", "_cookies", "_referer",
+ default_features = ["_redirect", "_cookies",
"_refresh", "_equiv",
"_basicauth", "_digestauth",
"_proxy", "_proxy_basicauth", "_proxy_digestauth",
- "_seek", "_robots",
+ "_robots",
]
if hasattr(_urllib2, 'HTTPSHandler'):
handler_classes["https"] = _urllib2.HTTPSHandler
default_schemes.append("https")
def __init__(self):
- OpenerDirector.__init__(self)
+ _opener.OpenerDirector.__init__(self)
ua_handlers = self._ua_handlers = {}
for scheme in (self.default_schemes+
@@ -116,7 +105,7 @@
# Yuck.
# Ensure correct default constructor args were passed to
- # HTTPRefererProcessor and HTTPEquivProcessor.
+ # HTTPRefreshProcessor and HTTPEquivProcessor.
if "_refresh" in ua_handlers:
self.set_handle_refresh(True)
if "_equiv" in ua_handlers:
@@ -130,12 +119,13 @@
ppm = _auth.HTTPProxyPasswordMgr()
self.set_password_manager(pm)
self.set_proxy_password_manager(ppm)
+ # set default certificate manager
+ if "https" in ua_handlers:
+ cm = _urllib2.HTTPSClientCertMgr()
+ self.set_client_cert_manager(cm)
- # special case, requires extra support from mechanize.Browser
- self._handle_referer = True
-
def close(self):
- OpenerDirector.close(self)
+ _opener.OpenerDirector.close(self)
self._ua_handlers = None
# XXX
@@ -175,10 +165,6 @@
for scheme in want.keys():
self._set_handler(scheme, True)
- def _add_referer_header(self, request, origin_request=True):
- raise NotImplementedError(
- "this class can't do HTTP Referer: use mechanize.Browser instead")
-
def set_cookiejar(self, cookiejar):
"""Set a mechanize.CookieJar, or None."""
self._set_handler("_cookies", obj=cookiejar)
@@ -200,6 +186,25 @@
self._proxy_password_manager.add_password(
realm, hostport, user, password)
+ def add_client_certificate(self, url, key_file, cert_file):
+ """Add an SSL client certificate, for HTTPS client auth.
+
+ key_file and cert_file must be filenames of the key and certificate
+ files, in PEM format. You can use e.g. OpenSSL to convert a p12 (PKCS
+ 12) file to PEM format:
+
+ openssl pkcs12 -clcerts -nokeys -in cert.p12 -out cert.pem
+ openssl pkcs12 -nocerts -in cert.p12 -out key.pem
+
+
+        Note that client certificate password input is currently very
+        inflexible: it seems to be console-only, which is presumably the
+        default behaviour of libopenssl.  In future mechanize may support
+        third-party libraries that (I assume) allow more options here.
+
+ """
+ self._client_cert_manager.add_key_cert(url, key_file, cert_file)
+
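A sketch of the new client-certificate hook, assuming mechanize.UserAgent remains the public entry point and that key.pem / cert.pem are existing PEM files (the filenames and URLs are placeholders):

    import mechanize

    ua = mechanize.UserAgent()
    ua.add_client_certificate("https://example.com/", "key.pem", "cert.pem")
    response = ua.open("https://example.com/protected")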
# the following are rarely useful -- use add_password / add_proxy_password
# instead
def set_password_manager(self, password_manager):
@@ -212,6 +217,11 @@
self._proxy_password_manager = password_manager
self._set_handler("_proxy_basicauth", obj=password_manager)
self._set_handler("_proxy_digestauth", obj=password_manager)
+ def set_client_cert_manager(self, cert_manager):
+ """Set a mechanize.HTTPClientCertMgr, or None."""
+ self._client_cert_manager = cert_manager
+ handler = self._ua_handlers["https"]
+ handler.client_cert_manager = cert_manager
# these methods all take a boolean parameter
def set_handle_robots(self, handle):
@@ -227,7 +237,8 @@
def set_handle_equiv(self, handle, head_parser_class=None):
"""Set whether to treat HTML http-equiv headers like HTTP headers.
- Response objects will be .seek()able if this is set.
+ Response objects may be .seek()able if this is set (currently returned
+ responses are, raised HTTPError exception responses are not).
"""
if head_parser_class is not None:
@@ -235,16 +246,6 @@
else:
constructor_kwds={}
self._set_handler("_equiv", handle, constructor_kwds=constructor_kwds)
- def set_handle_referer(self, handle):
- """Set whether to add Referer header to each request.
-
- This base class does not implement this feature (so don't turn this on
- if you're using this base class directly), but the subclass
- mechanize.Browser does.
-
- """
- self._set_handler("_referer", handle)
- self._handle_referer = bool(handle)
def set_handle_gzip(self, handle):
"""Handle gzip transfer encoding.
@@ -284,6 +285,9 @@
See docstring for .set_debug_redirects() for details of logging.
+ Response objects may be .seek()able if this is set (currently returned
+ responses are, raised HTTPError exception responses are not).
+
"""
self._set_handler("_debug_response_body", handle)
def set_debug_http(self, handle):
@@ -321,3 +325,24 @@
if newhandler is not None:
self.add_handler(newhandler)
self._ua_handlers[name] = newhandler
+
+
+class UserAgent(UserAgentBase):
+
+ def __init__(self):
+ UserAgentBase.__init__(self)
+ self._seekable = False
+
+ def set_seekable_responses(self, handle):
+ """Make response objects .seek()able."""
+ self._seekable = bool(handle)
+
+ def open(self, fullurl, data=None):
+ if self._seekable:
+ def bound_open(fullurl, data=None):
+ return UserAgentBase.open(self, fullurl, data)
+ response = _opener.wrapped_open(
+ bound_open, _response.seek_wrapped_response, fullurl, data)
+ else:
+ response = UserAgentBase.open(self, fullurl, data)
+ return response
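A sketch of the UserAgent / UserAgentBase split introduced above, assuming both names stay exported at package level (the URL is a placeholder):

    import mechanize

    ua = mechanize.UserAgent()
    ua.set_seekable_responses(True)
    response = ua.open("http://example.com/")
    first_chunk = response.read(64)
    response.seek(0)   # allowed because the response has been wrapped
    whole_body = response.read()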
Modified: Zope3/branches/adamg-mechanize-update/src/mechanize/_util.py
===================================================================
--- Zope3/branches/adamg-mechanize-update/src/mechanize/_util.py 2007-07-13 11:38:53 UTC (rev 77855)
+++ Zope3/branches/adamg-mechanize-update/src/mechanize/_util.py 2007-07-13 13:21:22 UTC (rev 77856)
@@ -1,4 +1,4 @@
-"""Python backwards-compat., date/time routines, seekable file object wrapper.
+"""Utility functions and date/time routines.
Copyright 2002-2006 John J Lee <jjl at pobox.com>
@@ -8,32 +8,21 @@
"""
-import re, string, time, copy, urllib, mimetools
-from types import TupleType
-from cStringIO import StringIO
+import re, string, time, warnings
-def startswith(string, initial):
- if len(initial) > len(string): return False
- return string[:len(initial)] == initial
+def deprecation(message):
+ warnings.warn(message, DeprecationWarning, stacklevel=3)
+def hide_deprecations():
+ warnings.filterwarnings('ignore', category=DeprecationWarning)
+def reset_deprecations():
+ warnings.filterwarnings('default', category=DeprecationWarning)
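The stacklevel=3 above is chosen so that warnings point at the code that called the deprecated wrapper, not at the wrapper or at deprecation() itself; a small self-contained illustration (old_api is a made-up name):

    import warnings

    def deprecation(message):
        warnings.warn(message, DeprecationWarning, stacklevel=3)

    def old_api():
        deprecation("old_api() is deprecated")

    old_api()  # the warning is reported against this line, not old_api()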
-def endswith(string, final):
- if len(final) > len(string): return False
- return string[-len(final):] == final
def isstringlike(x):
try: x+""
except: return False
else: return True
-SPACE_DICT = {}
-for c in string.whitespace:
- SPACE_DICT[c] = None
-del c
-def isspace(string):
- for c in string:
- if not SPACE_DICT.has_key(c): return False
- return True
-
## def caller():
## try:
## raise SyntaxError
@@ -42,33 +31,6 @@
## return sys.exc_traceback.tb_frame.f_back.f_back.f_code.co_name
-# this is here rather than in _HeadersUtil as it's just for
-# compatibility with old Python versions, rather than entirely new code
-def getheaders(msg, name):
- """Get all values for a header.
-
- This returns a list of values for headers given more than once; each
- value in the result list is stripped in the same way as the result of
- getheader(). If the header is not given, return an empty list.
- """
- result = []
- current = ''
- have_header = 0
- for s in msg.getallmatchingheaders(name):
- if isspace(s[0]):
- if current:
- current = "%s\n %s" % (current, string.strip(s))
- else:
- current = string.strip(s)
- else:
- if have_header:
- result.append(current)
- current = string.strip(s[string.find(s, ":") + 1:])
- have_header = 1
- if have_header:
- result.append(current)
- return result
-
from calendar import timegm
# Date/time conversion routines for formats used by the HTTP protocol.
@@ -86,7 +48,7 @@
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
months_lower = []
-for month in months: months_lower.append(string.lower(month))
+for month in months: months_lower.append(month.lower())
def time2isoz(t=None):
@@ -144,7 +106,7 @@
# translate month name to number
# month numbers start with 1 (January)
try:
- mon = months_lower.index(string.lower(mon))+1
+ mon = months_lower.index(mon.lower())+1
except ValueError:
# maybe it's already a number
try:
@@ -185,7 +147,7 @@
# adjust time using timezone string, to get absolute time since epoch
if tz is None:
tz = "UTC"
- tz = string.upper(tz)
+ tz = tz.upper()
offset = offset_from_tz_string(tz)
if offset is None:
return None
@@ -247,7 +209,7 @@
m = strict_re.search(text)
if m:
g = m.groups()
- mon = months_lower.index(string.lower(g[1])) + 1
+ mon = months_lower.index(g[1].lower()) + 1
tt = (int(g[2]), mon, int(g[0]),
int(g[3]), int(g[4]), float(g[5]))
return my_timegm(tt)
@@ -255,7 +217,7 @@
# No, we need some messy parsing...
# clean up
- text = string.lstrip(text)
+ text = text.lstrip()
text = wkday_re.sub("", text, 1) # Useless weekday
# tz is time zone specifier string
@@ -300,7 +262,7 @@
"""
# clean up
- text = string.lstrip(text)
+ text = text.lstrip()
# tz is time zone specifier string
day, mon, yr, hr, min, sec, tz = [None]*7
@@ -315,340 +277,3 @@
return None # bad format
return _str2time(day, mon, yr, hr, min, sec, tz)
-
-
-# XXX Andrew Dalke kindly sent me a similar class in response to my request on
-# comp.lang.python, which I then proceeded to lose. I wrote this class
-# instead, but I think he's released his code publicly since, could pinch the
-# tests from it, at least...
-
-# For testing seek_wrapper invariant (note that
-# test_urllib2.HandlerTest.test_seekable is expected to fail when this
-# invariant checking is turned on). The invariant checking is done by module
-# ipdc, which is available here:
-# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/436834
-## from ipdbc import ContractBase
-## class seek_wrapper(ContractBase):
-class seek_wrapper:
- """Adds a seek method to a file object.
-
- This is only designed for seeking on readonly file-like objects.
-
- Wrapped file-like object must have a read method. The readline method is
- only supported if that method is present on the wrapped object. The
- readlines method is always supported. xreadlines and iteration are
- supported only for Python 2.2 and above.
-
- Public attribute: wrapped (the wrapped file object).
-
-    WARNING: All other attributes of the wrapped object (i.e. those that are not
- one of wrapped, read, readline, readlines, xreadlines, __iter__ and next)
- are passed through unaltered, which may or may not make sense for your
- particular file object.
-
- """
- # General strategy is to check that cache is full enough, then delegate to
- # the cache (self.__cache, which is a cStringIO.StringIO instance). A seek
- # position (self.__pos) is maintained independently of the cache, in order
- # that a single cache may be shared between multiple seek_wrapper objects.
- # Copying using module copy shares the cache in this way.
-
- def __init__(self, wrapped):
- self.wrapped = wrapped
- self.__have_readline = hasattr(self.wrapped, "readline")
- self.__cache = StringIO()
- self.__pos = 0 # seek position
-
- def invariant(self):
- # The end of the cache is always at the same place as the end of the
- # wrapped file.
- return self.wrapped.tell() == len(self.__cache.getvalue())
-
- def __getattr__(self, name):
- wrapped = self.__dict__.get("wrapped")
- if wrapped:
- return getattr(wrapped, name)
- return getattr(self.__class__, name)
-
- def seek(self, offset, whence=0):
- assert whence in [0,1,2]
-
- # how much data, if any, do we need to read?
- if whence == 2: # 2: relative to end of *wrapped* file
- if offset < 0: raise ValueError("negative seek offset")
- # since we don't know yet where the end of that file is, we must
- # read everything
- to_read = None
- else:
- if whence == 0: # 0: absolute
- if offset < 0: raise ValueError("negative seek offset")
- dest = offset
- else: # 1: relative to current position
- pos = self.__pos
-                if pos + offset < 0:
- raise ValueError("seek to before start of file")
- dest = pos + offset
- end = len(self.__cache.getvalue())
- to_read = dest - end
- if to_read < 0:
- to_read = 0
-
- if to_read != 0:
- self.__cache.seek(0, 2)
- if to_read is None:
- assert whence == 2
- self.__cache.write(self.wrapped.read())
- self.__pos = self.__cache.tell() - offset
- else:
- self.__cache.write(self.wrapped.read(to_read))
- # Don't raise an exception even if we've seek()ed past the end
- # of .wrapped, since fseek() doesn't complain in that case.
- # Also like fseek(), pretend we have seek()ed past the end,
- # i.e. not:
- #self.__pos = self.__cache.tell()
- # but rather:
- self.__pos = dest
- else:
- self.__pos = dest
-
- def tell(self):
- return self.__pos
-
- def __copy__(self):
- cpy = self.__class__(self.wrapped)
- cpy.__cache = self.__cache
- return cpy
-
- def get_data(self):
- pos = self.__pos
- try:
- self.seek(0)
- return self.read(-1)
- finally:
- self.__pos = pos
-
- def read(self, size=-1):
- pos = self.__pos
- end = len(self.__cache.getvalue())
- available = end - pos
-
- # enough data already cached?
- if size <= available and size != -1:
- self.__cache.seek(pos)
- self.__pos = pos+size
- return self.__cache.read(size)
-
- # no, so read sufficient data from wrapped file and cache it
- if self.wrapped.read is None:
- # XXX oops, wrapped file-like-object isn't valid, ignore it
- return ''
-
- self.__cache.seek(0, 2)
- if size == -1:
- self.__cache.write(self.wrapped.read())
- else:
- to_read = size - available
- assert to_read > 0
- self.__cache.write(self.wrapped.read(to_read))
- self.__cache.seek(pos)
-
- data = self.__cache.read(size)
- self.__pos = self.__cache.tell()
- assert self.__pos == pos + len(data)
- return data
-
- def readline(self, size=-1):
- if not self.__have_readline:
- raise NotImplementedError("no readline method on wrapped object")
-
- # line we're about to read might not be complete in the cache, so
- # read another line first
- pos = self.__pos
- self.__cache.seek(0, 2)
- self.__cache.write(self.wrapped.readline())
- self.__cache.seek(pos)
-
- data = self.__cache.readline()
- if size != -1:
- r = data[:size]
- self.__pos = pos+size
- else:
- r = data
- self.__pos = pos+len(data)
- return r
-
- def readlines(self, sizehint=-1):
- pos = self.__pos
- self.__cache.seek(0, 2)
- self.__cache.write(self.wrapped.read())
- self.__cache.seek(pos)
- data = self.__cache.readlines(sizehint)
- self.__pos = self.__cache.tell()
- return data
-
- def __iter__(self): return self
- def next(self):
- line = self.readline()
- if line == "": raise StopIteration
- return line
-
- xreadlines = __iter__
-
- def __repr__(self):
- return ("<%s at %s whose wrapped object = %r>" %
- (self.__class__.__name__, hex(id(self)), self.wrapped))
-
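A sketch of the wrapper's behaviour on a plain forward-only stream. The import path is an assumption based on this reorganisation (the class appears to move into the new _response module); ForwardOnly is a made-up stand-in for a non-seekable response:

    from StringIO import StringIO
    from mechanize._response import seek_wrapper  # assumed new location

    class ForwardOnly:
        # minimal readonly, non-seekable file-like object
        def __init__(self, data):
            self._fh = StringIO(data)
        def read(self, size=-1):
            return self._fh.read(size)
        def readline(self):
            return self._fh.readline()

    f = seek_wrapper(ForwardOnly("line one\nline two\n"))
    print f.readline()  # 'line one'
    f.seek(0)           # works even though ForwardOnly cannot seek
    print f.read()      # the whole body again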
-
-class response_seek_wrapper(seek_wrapper):
-
- """
- Supports copying response objects and setting response body data.
-
- """
-
- def __init__(self, wrapped):
- seek_wrapper.__init__(self, wrapped)
- self._headers = self.wrapped.info()
-
- def __copy__(self):
- cpy = seek_wrapper.__copy__(self)
- # copy headers from delegate
- cpy._headers = copy.copy(self.info())
- return cpy
-
- def info(self):
- return self._headers
-
- def set_data(self, data):
- self.seek(0)
- self.read()
- self.close()
- cache = self._seek_wrapper__cache = StringIO()
- cache.write(data)
- self.seek(0)
-
-
-class eoffile:
- # file-like object that always claims to be at end-of-file...
- def read(self, size=-1): return ""
- def readline(self, size=-1): return ""
- def __iter__(self): return self
- def next(self): return ""
- def close(self): pass
-
-class eofresponse(eoffile):
- def __init__(self, url, headers, code, msg):
- self._url = url
- self._headers = headers
- self.code = code
- self.msg = msg
- def geturl(self): return self._url
- def info(self): return self._headers
-
-
-class closeable_response:
- """Avoids unnecessarily clobbering urllib.addinfourl methods on .close().
-
- Only supports responses returned by mechanize.HTTPHandler.
-
- After .close(), the following methods are supported:
-
- .read()
- .readline()
- .readlines()
- .seek()
- .tell()
- .info()
- .geturl()
- .__iter__()
- .next()
- .close()
-
- and the following attributes are supported:
-
- .code
- .msg
-
- Also supports pickling (but the stdlib currently does something to prevent
- it: http://python.org/sf/1144636).
-
- """
-    # presence of this attr indicates the response is usable after .close()
- closeable_response = None
-
- def __init__(self, fp, headers, url, code, msg):
- self._set_fp(fp)
- self._headers = headers
- self._url = url
- self.code = code
- self.msg = msg
-
- def _set_fp(self, fp):
- self.fp = fp
- self.read = self.fp.read
- self.readline = self.fp.readline
- if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
- if hasattr(self.fp, "fileno"):
- self.fileno = self.fp.fileno
- else:
- self.fileno = lambda: None
- if hasattr(self.fp, "__iter__"):
- self.__iter__ = self.fp.__iter__
- if hasattr(self.fp, "next"):
- self.next = self.fp.next
-
- def __repr__(self):
- return '<%s at %s whose fp = %r>' % (
- self.__class__.__name__, hex(id(self)), self.fp)
-
- def info(self):
- return self._headers
-
- def geturl(self):
- return self._url
-
- def close(self):
- wrapped = self.fp
- wrapped.close()
- new_wrapped = eofresponse(
- self._url, self._headers, self.code, self.msg)
- self._set_fp(new_wrapped)
-
- def __getstate__(self):
- # There are three obvious options here:
- # 1. truncate
- # 2. read to end
- # 3. close socket, pickle state including read position, then open
- # again on unpickle and use Range header
- # XXXX um, 4. refuse to pickle unless .close()d. This is better,
- # actually ("errors should never pass silently"). Pickling doesn't
- # work anyway ATM, because of http://python.org/sf/1144636 so fix
- # this later
-
- # 2 breaks pickle protocol, because one expects the original object
- # to be left unscathed by pickling. 3 is too complicated and
- # surprising (and too much work ;-) to happen in a sane __getstate__.
- # So we do 1.
-
- state = self.__dict__.copy()
- new_wrapped = eofresponse(
- self._url, self._headers, self.code, self.msg)
- state["wrapped"] = new_wrapped
- return state
-
-def make_response(data, headers, url, code, msg):
- """Convenient factory for objects implementing response interface.
-
- data: string containing response body data
- headers: sequence of (name, value) pairs
- url: URL of response
- code: integer response code (e.g. 200)
- msg: string response code message (e.g. "OK")
-
- """
- hdr_text = []
- for name_value in headers:
- hdr_text.append("%s: %s" % name_value)
- mime_headers = mimetools.Message(StringIO("\n".join(hdr_text)))
- r = closeable_response(StringIO(data), mime_headers, url, code, msg)
- return response_seek_wrapper(r)
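make_response() is mainly useful for building stub responses in tests. A sketch, with the import path an assumption based on this reorganisation (the helper appears to move into the new _response module); all argument values are made up:

    from mechanize._response import make_response  # assumed new location

    r = make_response("<html>stub</html>",
                      [("Content-Type", "text/html")],
                      "http://example.com/stub", 200, "OK")
    print r.code, r.geturl()                  # 200 http://example.com/stub
    print r.info().getheader("Content-Type")  # text/html
    print r.read()                            # <html>stub</html>
    r.seek(0)                                 # responses are seekable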