[Zope3-checkins] SVN: Zope3/trunk/src/ - update mechanize and
ClientForm to newest versions
Benji York
benji at zope.com
Wed Jun 21 17:54:08 EDT 2006
Log message for revision 68784:
- update mechanize and ClientForm to newest versions
- remove the now-unnecessary pullparser and ClientCookie
- apply two bug fixes to mechanize, one of which repairs issue 645
- add regression test for issue 645
- browser.contents now reflects the body returned from the server when an
error occurs
- fix apidoc's use of the undocumented error.read method in a test; use
browser.contents instead
Changed:
D Zope3/trunk/src/ClientCookie/
U Zope3/trunk/src/ClientForm.py
U Zope3/trunk/src/mechanize/__init__.py
A Zope3/trunk/src/mechanize/_auth.py
A Zope3/trunk/src/mechanize/_clientcookie.py
A Zope3/trunk/src/mechanize/_gzip.py
A Zope3/trunk/src/mechanize/_headersutil.py
A Zope3/trunk/src/mechanize/_html.py
A Zope3/trunk/src/mechanize/_lwpcookiejar.py
U Zope3/trunk/src/mechanize/_mechanize.py
A Zope3/trunk/src/mechanize/_mozillacookiejar.py
A Zope3/trunk/src/mechanize/_msiecookiejar.py
A Zope3/trunk/src/mechanize/_opener.py
A Zope3/trunk/src/mechanize/_pullparser.py
A Zope3/trunk/src/mechanize/_request.py
A Zope3/trunk/src/mechanize/_urllib2.py
A Zope3/trunk/src/mechanize/_urllib2_support.py
U Zope3/trunk/src/mechanize/_useragent.py
A Zope3/trunk/src/mechanize/_util.py
D Zope3/trunk/src/pullparser.py
U Zope3/trunk/src/zope/app/apidoc/browser/README.txt
U Zope3/trunk/src/zope/testbrowser/DEPENDENCIES.cfg
U Zope3/trunk/src/zope/testbrowser/README.txt
U Zope3/trunk/src/zope/testbrowser/browser.py
U Zope3/trunk/src/zope/testbrowser/testing.py
U Zope3/trunk/src/zope/testbrowser/tests.py
-=-
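(A minimal sketch, not part of this checkin, of the testbrowser behaviour the new
regression test covers: after a request that the server answers with an error
status, the returned body is visible through browser.contents. The URL and the
exact error class raised are assumptions about this revision's setup.)

    from zope.testbrowser.testing import Browser
    import urllib2

    browser = Browser()
    try:
        # hypothetical view name; any request the publisher answers with an
        # error status will do
        browser.open('http://localhost/@@no-such-view')
    except urllib2.HTTPError:
        pass
    # browser.contents now reflects the error body the server sent back,
    # rather than the contents of the previous successful request
    print browser.contents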
Modified: Zope3/trunk/src/ClientForm.py
===================================================================
--- Zope3/trunk/src/ClientForm.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/ClientForm.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -15,7 +15,7 @@
HTML 4.01 Specification, W3C Recommendation 24 December 1999
-Copyright 2002-2005 John J. Lee <jjl at pobox.com>
+Copyright 2002-2006 John J. Lee <jjl at pobox.com>
Copyright 2005 Gary Poster
Copyright 2005 Zope Corporation
Copyright 1998-2000 Gisle Aas.
@@ -27,44 +27,40 @@
"""
# XXX
+# Remove unescape_attr method
+# Remove parser testing hack
+# safeUrl()-ize action
+# Really should merge CC, CF, pp and mechanize as soon as mechanize
+# goes to beta...
+# Add url attribute to ParseError
+# Switch to unicode throughout (would be 0.3.x)
+# See Wichert Akkerman's 2004-01-22 message to c.l.py.
+# Add charset parameter to Content-type headers? How to find value??
# Add some more functional tests
# Especially single and multiple file upload on the internet.
# Does file upload work when name is missing? Sourceforge tracker form
# doesn't like it. Check standards, and test with Apache. Test
# binary upload with Apache.
-# There have been reports that some servers are very picky about MIME
-# boundaries, so file uploads may fail with those servers. Should
-# copy what IE does religiously.
-# Unicode: see Wichert Akkerman's 2004-01-22 message to c.l.py.
# Controls can have name=None (e.g. forms constructed partly with
# JavaScript), but find_control can't be told to find a control
# with that name, because None there means 'unspecified'. Can still
# get at by nr, but would be nice to be able to specify something
# equivalent to name=None, too.
-# Deal with character sets properly. Not sure what the issues are here.
-# Do URL encodings need any attention?
-# I don't *think* any encoding of control names, filenames or data is
-# necessary -- HTML spec. doesn't require it, and Mozilla Firebird 0.6
-# doesn't seem to do it.
-# Add charset parameter to Content-type headers? How to find value??
# mailto submission & enctype text/plain
# I'm not going to fix this unless somebody tells me what real servers
# that want this encoding actually expect: If enctype is
# application/x-www-form-urlencoded and there's a FILE control present.
# Strictly, it should be 'name=data' (see HTML 4.01 spec., section
# 17.13.2), but I send "name=" ATM. What about multiple file upload??
-# Get rid of MimeWriter.
-# Should really use sgmllib, not htmllib.
# Would be nice, but I'm not going to do it myself:
# -------------------------------------------------
-# Maybe a 0.3.x?
+# Maybe a 0.4.x?
# Replace by_label etc. with moniker / selector concept. Allows, eg.,
# a choice between selection by value / id / label / element
# contents. Or choice between matching labels exactly or by
# substring. Etc.
# Remove deprecated methods.
-# action should probably be an absolute URI, like DOMForm.
# ...what else?
# Work on DOMForm.
# XForms? Don't know if there's a need here.
@@ -81,8 +77,38 @@
if expr: return True
else: return False
+try:
+ import logging
+except ImportError:
+ def debug(msg, *args, **kwds):
+ pass
+else:
+ _logger = logging.getLogger("ClientForm")
+ OPTIMIZATION_HACK = True
+
+ def debug(msg, *args, **kwds):
+ if OPTIMIZATION_HACK:
+ return
+
+ try:
+ raise Exception()
+ except:
+ caller_name = (
+ sys.exc_info()[2].tb_frame.f_back.f_back.f_code.co_name)
+ extended_msg = '%%s %s' % msg
+ extended_args = (caller_name,)+args
+ debug = _logger.debug(extended_msg, *extended_args, **kwds)
+
+ def _show_debug_messages():
+ global OPTIMIZATION_HACK
+ OPTIMIZATION_HACK = False
+ _logger.setLevel(logging.DEBUG)
+ handler = logging.StreamHandler(sys.stdout)
+ handler.setLevel(logging.DEBUG)
+ _logger.addHandler(handler)
+
import sys, urllib, urllib2, types, mimetools, copy, urlparse, \
- htmlentitydefs, re
+ htmlentitydefs, re, random
from urlparse import urljoin
from cStringIO import StringIO
@@ -95,10 +121,12 @@
def deprecation(message):
warnings.warn(message, DeprecationWarning, stacklevel=2)
-VERSION = "0.2.1a"
+VERSION = "0.2.2"
CHUNK = 1024 # size of chunks fed to parser, in bytes
+DEFAULT_ENCODING = "latin-1"
+
_compress_re = re.compile(r"\s+")
def compress_text(text): return _compress_re.sub(" ", text.strip())
@@ -171,15 +199,62 @@
l.append(k + '=' + urllib.quote_plus(str(elt)))
return '&'.join(l)
-def unescape(data, entities):
- if data is None or '&' not in data:
+def unescape(data, entities, encoding=DEFAULT_ENCODING):
+ if data is None or "&" not in data:
return data
- def replace_entities(match, entities=entities):
+
+ def replace_entities(match, entities=entities, encoding=encoding):
ent = match.group()
- repl = entities.get(ent, ent)
+ if ent[1] == "#":
+ return unescape_charref(ent[2:-1], encoding)
+
+ repl = entities.get(ent)
+ if repl is not None:
+ if type(repl) != type(""):
+ try:
+ repl = repl.encode(encoding)
+ except UnicodeError:
+ repl = ent
+ else:
+ repl = ent
+
return repl
- return re.sub(r'&\S+?;', replace_entities, data)
+ return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
+
+def unescape_charref(data, encoding):
+ name, base = data, 10
+ if name.startswith("x"):
+ name, base= name[1:], 16
+ uc = unichr(int(name, base))
+ if encoding is None:
+ return uc
+ else:
+ try:
+ repl = uc.encode(encoding)
+ except UnicodeError:
+ repl = "&#%s;" % data
+ return repl
+
+def get_entitydefs():
+ import htmlentitydefs
+ from codecs import latin_1_decode
+ entitydefs = {}
+ try:
+ htmlentitydefs.name2codepoint
+ except AttributeError:
+ entitydefs = {}
+ for name, char in htmlentitydefs.entitydefs.items():
+ uc = latin_1_decode(char)[0]
+ if uc.startswith("&#") and uc.endswith(";"):
+ uc = unescape_charref(uc[2:-1], None)
+ entitydefs["&%s;" % name] = uc
+ else:
+ for name, codepoint in htmlentitydefs.name2codepoint.items():
+ entitydefs["&%s;" % name] = unichr(codepoint)
+ return entitydefs
+
+
def issequence(x):
try:
x[0]
@@ -195,74 +270,15 @@
else: return True
-# XXX don't really want to drag this along (MimeWriter, choose_boundary)
-
-# --------------------------------------------------------------------
-# grabbed from Python standard library mimetools module and tweaked to
-# avoid socket.gaierror and to avoid dots ('.') in MIME boundaries
-try:
- import thread
- _thread = thread; del thread
-except ImportError:
- import dummy_thread
- _thread = dummy_thread; del dummy_thread
-_counter_lock = _thread.allocate_lock()
-del _thread
-
-_counter = 0
-def _get_next_counter():
- global _counter
- _counter_lock.acquire()
- _counter = _counter + 1
- result = _counter
- _counter_lock.release()
- return result
-
-_prefix = None
-
def choose_boundary():
- """Return a string usable as a multipart boundary.
+ """Return a string usable as a multipart boundary."""
+ # follow IE and firefox
+ nonce = "".join([str(random.randint(0, sys.maxint-1)) for i in 0,1,2])
+ return "-"*27 + nonce
- The string chosen is unique within a single program run, and
- incorporates the user id (if available), process id (if available),
- and current time. So it's very unlikely the returned string appears
- in message text, but there's no guarantee.
-
- The boundary contains dots so you have to quote it in the header."""
-
- global _prefix
- import time
- import os
- import socket
- if _prefix is None:
- try:
- socket.gaierror
- except AttributeError:
- exc = socket.error
- else:
- exc = socket.gaierror
-
- try:
- hostid = socket.gethostbyname(socket.gethostname())
- except exc:
- hostid = 'localhost'
- try:
- uid = repr(os.getuid())
- except AttributeError:
- uid = '1'
- try:
- pid = repr(os.getpid())
- except AttributeError:
- pid = '1'
- _prefix = hostid + uid + pid
- return "%s%d%d" % (_prefix, long(time.time()*100), _get_next_counter())
-
-# end of code from mimetools module
-# --------------------------------------------------------------------
-
# This cut-n-pasted MimeWriter from standard library is here so can add
# to HTTP headers rather than message body when appropriate. It also uses
-# \r\n in place of \n. This is nasty.
+# \r\n in place of \n. This is a bit nasty.
class MimeWriter:
"""Generic MIME writer.
@@ -420,10 +436,11 @@
class _AbstractFormParser:
"""forms attribute contains HTMLForm instances on completion."""
# thanks to Moshe Zadka for an example of sgmllib/htmllib usage
- def __init__(self, entitydefs=None):
+ def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
if entitydefs is None:
entitydefs = get_entitydefs()
self._entitydefs = entitydefs
+ self._encoding = encoding
self.base = None
self.forms = []
@@ -436,17 +453,20 @@
self._textarea = None
def do_base(self, attrs):
+ debug("%s", attrs)
for key, value in attrs:
if key == "href":
self.base = value
def end_body(self):
+ debug("")
if self._current_label is not None:
self.end_label()
if self._current_form is not None:
self.end_form()
def start_form(self, attrs):
+ debug("%s", attrs)
if self._current_form is not None:
raise ParseError("nested FORMs")
name = None
@@ -468,6 +488,7 @@
self._current_form = (name, action, method, enctype), d, controls
def end_form(self):
+ debug("")
if self._current_label is not None:
self.end_label()
if self._current_form is None:
@@ -476,6 +497,7 @@
self._current_form = None
def start_select(self, attrs):
+ debug("%s", attrs)
if self._current_form is None:
raise ParseError("start of SELECT before start of FORM")
if self._select is not None:
@@ -492,6 +514,7 @@
self._append_select_control({"__select": d})
def end_select(self):
+ debug("")
if self._current_form is None:
raise ParseError("end of SELECT before start of FORM")
if self._select is None:
@@ -503,6 +526,7 @@
self._select = None
def start_optgroup(self, attrs):
+ debug("%s", attrs)
if self._select is None:
raise ParseError("OPTGROUP outside of SELECT")
d = {}
@@ -512,11 +536,13 @@
self._optgroup = d
def end_optgroup(self):
+ debug("")
if self._optgroup is None:
raise ParseError("end of OPTGROUP before start")
self._optgroup = None
def _start_option(self, attrs):
+ debug("%s", attrs)
if self._select is None:
raise ParseError("OPTION outside of SELECT")
if self._option is not None:
@@ -533,6 +559,7 @@
self._option["disabled"] = None
def _end_option(self):
+ debug("")
if self._option is None:
raise ParseError("end of OPTION before start")
@@ -549,11 +576,13 @@
self._option = None
def _append_select_control(self, attrs):
+ debug("%s", attrs)
controls = self._current_form[2]
name = self._select.get("name")
controls.append(("select", name, attrs))
def start_textarea(self, attrs):
+ debug("%s", attrs)
if self._current_form is None:
raise ParseError("start of TEXTAREA before start of FORM")
if self._textarea is not None:
@@ -568,6 +597,7 @@
self._textarea = d
def end_textarea(self):
+ debug("")
if self._current_form is None:
raise ParseError("end of TEXTAREA before start of FORM")
if self._textarea is None:
@@ -578,6 +608,7 @@
self._textarea = None
def start_label(self, attrs):
+ debug("%s", attrs)
if self._current_label:
self.end_label()
d = {}
@@ -591,6 +622,7 @@
self._current_label = d
def end_label(self):
+ debug("")
label = self._current_label
if label is None:
# something is ugly in the HTML, but we're ignoring it
@@ -601,6 +633,7 @@
del label["__taken"]
def _add_label(self, d):
+ #debug("%s", d)
if self._current_label is not None:
if self._current_label["__taken"]:
self.end_label() # be fuzzy
@@ -609,6 +642,7 @@
d["__label"] = self._current_label
def handle_data(self, data):
+ debug("%s", data)
if self._option is not None:
# self._option is a dictionary of the OPTION element's HTML
# attributes, but it has two special keys, one of which is the
@@ -632,6 +666,7 @@
map[key] = map[key] + data
def do_button(self, attrs):
+ debug("%s", attrs)
if self._current_form is None:
raise ParseError("start of BUTTON before start of FORM")
d = {}
@@ -651,6 +686,7 @@
controls.append((type, name, d))
def do_input(self, attrs):
+ debug("%s", attrs)
if self._current_form is None:
raise ParseError("start of INPUT before start of FORM")
d = {}
@@ -665,6 +701,7 @@
controls.append((type, name, d))
def do_isindex(self, attrs):
+ debug("%s", attrs)
if self._current_form is None:
raise ParseError("start of ISINDEX before start of FORM")
d = {}
@@ -677,18 +714,20 @@
controls.append(("isindex", None, d))
def handle_entityref(self, name):
- table = self._entitydefs
- fullname = "&%s;" % name
- if table.has_key(fullname):
- self.handle_data(table[fullname])
- else:
- self.unknown_entityref(name)
- return
+ #debug("%s", name)
+ self.handle_data(unescape(
+ '&%s;' % name, self._entitydefs, self._encoding))
+ def handle_charref(self, name):
+ #debug("%s", name)
+ self.handle_data(unescape_charref(name, self._encoding))
+
def unescape_attr(self, name):
- return unescape(name, self._entitydefs)
+ #debug("%s", name)
+ return unescape(name, self._entitydefs, self._encoding)
def unescape_attrs(self, attrs):
+ #debug("%s", attrs)
escaped_attrs = {}
for key, val in attrs.items():
try:
@@ -710,15 +749,15 @@
import HTMLParser
except ImportError:
class XHTMLCompatibleFormParser:
- def __init__(self, entitydefs=None):
+ def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
raise ValueError("HTMLParser could not be imported")
else:
class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
"""Good for XHTML, bad for tolerance of incorrect HTML."""
# thanks to Michael Howitz for this!
- def __init__(self, entitydefs=None):
+ def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
HTMLParser.HTMLParser.__init__(self)
- _AbstractFormParser.__init__(self, entitydefs)
+ _AbstractFormParser.__init__(self, entitydefs, encoding)
def start_option(self, attrs):
_AbstractFormParser._start_option(self, attrs)
@@ -747,18 +786,6 @@
else:
method()
- # taken from sgmllib, with changes
- def handle_charref(self, name):
- try:
- n = int(name)
- except ValueError:
- self.unknown_charref(name)
- return
- if not 0 <= n <= 255:
- self.unknown_charref(name)
- return
- self.handle_data(chr(n))
-
def unescape(self, name):
# Use the entitydefs passed into constructor, not
# HTMLParser.HTMLParser's entitydefs.
@@ -769,13 +796,10 @@
def unescape_attrs_if_required(self, attrs):
return attrs # ditto
-import htmllib, formatter
-class FormParser(_AbstractFormParser, htmllib.HTMLParser):
- """Good for tolerance of incorrect HTML, bad for XHTML."""
- def __init__(self, entitydefs=None):
- htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
- _AbstractFormParser.__init__(self, entitydefs)
-
+import sgmllib
+# monkeypatch to fix http://www.python.org/sf/803422 :-(
+sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
+class _AbstractSgmllibParser(_AbstractFormParser):
def do_option(self, attrs):
_AbstractFormParser._start_option(self, attrs)
@@ -784,19 +808,52 @@
def unescape_attrs_if_required(self, attrs):
return self.unescape_attrs(attrs)
+class FormParser(_AbstractSgmllibParser, sgmllib.SGMLParser):
+ """Good for tolerance of incorrect HTML, bad for XHTML."""
+ def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
+ sgmllib.SGMLParser.__init__(self)
+ _AbstractFormParser.__init__(self, entitydefs, encoding)
+
+try:
+ if sys.version_info[:2] < (2, 2):
+ raise ImportError # BeautifulSoup uses generators
+ import BeautifulSoup
+except ImportError:
+ pass
+else:
+ class _AbstractBSFormParser(_AbstractSgmllibParser):
+ bs_base_class = None
+ def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
+ _AbstractFormParser.__init__(self, entitydefs, encoding)
+ self.bs_base_class.__init__(self)
+ def handle_data(self, data):
+ _AbstractFormParser.handle_data(self, data)
+ self.bs_base_class.handle_data(self, data)
+
+ class RobustFormParser(_AbstractBSFormParser, BeautifulSoup.BeautifulSoup):
+ """Tries to be highly tolerant of incorrect HTML."""
+ bs_base_class = BeautifulSoup.BeautifulSoup
+ class NestingRobustFormParser(_AbstractBSFormParser,
+ BeautifulSoup.ICantBelieveItsBeautifulSoup):
+ """Tries to be highly tolerant of incorrect HTML.
+
+ Different from RobustFormParser in that it more often guesses nesting
+ above missing end tags (see BeautifulSoup docs).
+
+ """
+ bs_base_class = BeautifulSoup.ICantBelieveItsBeautifulSoup
+
#FormParser = XHTMLCompatibleFormParser # testing hack
+#FormParser = RobustFormParser # testing hack
-def get_entitydefs():
- entitydefs = {}
- for name, char in htmlentitydefs.entitydefs.items():
- entitydefs["&%s;" % name] = char
- return entitydefs
-
def ParseResponse(response, select_default=False,
ignore_errors=False, # ignored!
form_parser_class=FormParser,
request_class=urllib2.Request,
- entitydefs=None, backwards_compat=True):
+ entitydefs=None,
+ backwards_compat=True,
+ encoding=DEFAULT_ENCODING,
+ ):
"""Parse HTTP response and return a list of HTMLForm instances.
The return value of urllib2.urlopen can be conveniently passed to this
@@ -811,11 +868,17 @@
form_parser_class: class to instantiate and use to pass
request_class: class to return from .click() method (default is
urllib2.Request)
- entitydefs: mapping like {'&': '&', ...} containing HTML entity
+ entitydefs: mapping like {"&": "&", ...} containing HTML entity
definitions (a sensible default is used)
+ encoding: character encoding used for encoding numeric character references
+ when matching link text. ClientForm does not attempt to find the encoding
+ in a META HTTP-EQUIV attribute in the document itself (mechanize, for
+ example, does do that and will pass the correct value to ClientForm using
+ this parameter).
backwards_compat: boolean that determines whether the returned HTMLForm
- objects are backwards-compatible with old code. If backwards_compat is True:
+ objects are backwards-compatible with old code. If backwards_compat is
+ true:
- ClientForm 0.1 code will continue to work as before.
@@ -844,7 +907,7 @@
There is a choice of parsers. ClientForm.XHTMLCompatibleFormParser (uses
HTMLParser.HTMLParser) works best for XHTML, ClientForm.FormParser (uses
- htmllib.HTMLParser) (the default) works best for ordinary grubby HTML.
+ sgmllib.SGMLParser) (the default) works better for ordinary grubby HTML.
Note that HTMLParser is only available in Python 2.2 and later. You can
pass your own class in here as a hack to work around bad HTML, but at your
own risk: there is no well-defined interface.
@@ -854,13 +917,19 @@
False,
form_parser_class,
request_class,
- entitydefs, backwards_compat)
+ entitydefs,
+ backwards_compat,
+ encoding,
+ )
def ParseFile(file, base_uri, select_default=False,
ignore_errors=False, # ignored!
form_parser_class=FormParser,
request_class=urllib2.Request,
- entitydefs=None, backwards_compat=True):
+ entitydefs=None,
+ backwards_compat=True,
+ encoding=DEFAULT_ENCODING,
+ ):
"""Parse HTML and return a list of HTMLForm instances.
ClientForm.ParseError is raised on parse errors.
@@ -876,7 +945,7 @@
"""
if backwards_compat:
deprecation("operating in backwards-compatibility mode")
- fp = form_parser_class(entitydefs)
+ fp = form_parser_class(entitydefs, encoding)
while 1:
data = file.read(CHUNK)
try:
@@ -916,8 +985,9 @@
type, name, attrs = controls[ii]
attrs = fp.unescape_attrs_if_required(attrs)
name = fp.unescape_attr_if_required(name)
+ # index=ii*10 allows ImageControl to return multiple ordered pairs
form.new_control(type, name, attrs, select_default=select_default,
- index=ii)
+ index=ii*10)
forms.append(form)
for form in forms:
form.fixup()
@@ -930,7 +1000,7 @@
self._text = attrs.get("__text").strip()
self._ctext = compress_text(self._text)
self.attrs = attrs
- self._backwards_compat = False # maintaned by HTMLForm
+ self._backwards_compat = False # maintained by HTMLForm
def __getattr__(self, name):
if name == "text":
@@ -942,15 +1012,15 @@
def __setattr__(self, name, value):
if name == "text":
- # don't see any need for this
+ # don't see any need for this, so make it read-only
raise AttributeError("text attribute is read-only")
self.__dict__[name] = value
def __str__(self):
- return '<Label(id=%r, text=%r)>' % (self.id, self.text)
+ return "<Label(id=%r, text=%r)>" % (self.id, self.text)
-def _getLabel(attrs):
+def _get_label(attrs):
text = attrs.get("__label")
if text is not None:
return Label(text)
@@ -1049,15 +1119,14 @@
"""
raise NotImplementedError()
- def _write_mime_data(self, mw):
- """Write data for this control to a MimeWriter."""
+ def _write_mime_data(self, mw, name, value):
+ """Write data for a subitem of this control to a MimeWriter."""
# called by HTMLForm
- for name, value in self.pairs():
- mw2 = mw.nextpart()
- mw2.addheader("Content-disposition",
- 'form-data; name="%s"' % name, 1)
- f = mw2.startbody(prefix=0)
- f.write(value)
+ mw2 = mw.nextpart()
+ mw2.addheader("Content-disposition",
+ 'form-data; name="%s"' % name, 1)
+ f = mw2.startbody(prefix=0)
+ f.write(value)
def __str__(self):
raise NotImplementedError()
@@ -1093,7 +1162,7 @@
"""
def __init__(self, type, name, attrs, index=None):
self._index = index
- self._label = _getLabel(attrs)
+ self._label = _get_label(attrs)
self.__dict__["type"] = type.lower()
self.__dict__["name"] = name
self._value = attrs.get("value")
@@ -1161,7 +1230,6 @@
INPUT/TEXT
INPUT/PASSWORD
- INPUT/FILE
INPUT/HIDDEN
TEXTAREA
@@ -1219,8 +1287,9 @@
return []
return [(self._index, self.name, "")]
- def _write_mime_data(self, mw):
+ def _write_mime_data(self, mw, _name, _value):
# called by HTMLForm
+ # assert _name == self.name and _value == ''
if len(self._upload_data) == 1:
# single file
file_object, content_type, filename = self._upload_data[0]
@@ -1381,7 +1450,7 @@
class Item:
def __init__(self, control, attrs, index=None):
- label = _getLabel(attrs)
+ label = _get_label(attrs)
self.__dict__.update({
"name": attrs["value"],
"_labels": label and [label] or [],
@@ -1793,7 +1862,7 @@
def merge_control(self, control):
assert bool(control.multiple) == bool(self.multiple)
- #assert isinstance(control, self.__class__)
+ # usually, isinstance(control, self.__class__)
self.items.extend(control.items)
def fixup(self):
@@ -2084,6 +2153,12 @@
SELECT (and OPTION)
+
+ OPTION 'values', in HTML parlance, are Item 'names' in ClientForm parlance.
+
+
+ OPTION 'values', in HTML parlance, are Item 'names' in ClientForm parlance.
+
SELECT control values and labels are subject to some messy defaulting
rules. For example, if the HTML representation of the control is:
@@ -2094,9 +2169,9 @@
</SELECT>
The items, in order, have labels "2002", "2001" and "2000", whereas their
- values are "0", "1" and "2000" respectively. Note that the value of the
- last OPTION in this example defaults to its contents, as specified by RFC
- 1866, as do the labels of the second and third OPTIONs.
+ names (the OPTION values) are "0", "1" and "2000" respectively. Note that
+ the value of the last OPTION in this example defaults to its contents, as
+ specified by RFC 1866, as do the labels of the second and third OPTIONs.
The OPTION labels are sometimes more meaningful than the OPTION values,
which can make for more maintainable code.
@@ -2106,14 +2181,13 @@
The attrs attribute is a dictionary of the original HTML attributes of the
SELECT element. Other ListControls do not have this attribute, because in
other cases the control as a whole does not correspond to any single HTML
- element. The get_item_attrs method may be used as usual to get at the
- HTML attributes of the HTML elements corresponding to individual list items
- (for SELECT controls, these are OPTION elements).
+ element. control.get(...).attrs may be used as usual to get at the HTML
+ attributes of the HTML elements corresponding to individual list items (for
+ SELECT controls, these are OPTION elements).
- Another special case is that the attributes dictionaries returned by
- get_item_attrs have a special key "contents" which does not correspond to
- any real HTML attribute, but rather contains the contents of the OPTION
- element:
+ Another special case is that the Item.attrs dictionaries have a special key
+ "contents" which does not correspond to any real HTML attribute, but rather
+ contains the contents of the OPTION element:
<OPTION>this bit</OPTION>
@@ -2136,7 +2210,7 @@
# fish out the SELECT HTML attributes from the OPTION HTML attributes
# dictionary
self.attrs = attrs["__select"].copy()
- self.__dict__["_label"] = _getLabel(self.attrs)
+ self.__dict__["_label"] = _get_label(self.attrs)
self.__dict__["id"] = self.attrs.get("id")
self.__dict__["multiple"] = self.attrs.has_key("multiple")
# the majority of the contents, label, and value dance already happened
@@ -2169,14 +2243,19 @@
def fixup(self):
ListControl.fixup(self)
# Firefox doesn't exclude disabled items from those considered here
- # (i.e. from 'found', for both brances of the if below). Note that IE
- # doesn't support the disabled attribute on OPTIONs at all.
+ # (i.e. from 'found', for both branches of the if below). Note that
+ # IE6 doesn't support the disabled attribute on OPTIONs at all.
found = [o for o in self.items if o.selected]
if not found:
if not self.multiple or self._select_default:
for o in self.items:
if not o.disabled:
- o.selected = True
+ was_disabled = self.disabled
+ self.disabled = False
+ try:
+ o.selected = True
+ finally:
+ o.disabled = was_disabled
break
elif not self.multiple:
# Ensure only one item selected. Choose the last one,
@@ -2245,11 +2324,11 @@
if name is None: return []
pairs = [
(self._index, "%s.x" % name, str(clicked[0])),
- (self._index, "%s.y" % name, str(clicked[1])),
+ (self._index+1, "%s.y" % name, str(clicked[1])),
]
value = self._value
if value:
- pairs.append((self._index, name, value))
+ pairs.append((self._index+2, name, value))
return pairs
get_labels = ScalarControl.get_labels
@@ -2301,8 +2380,10 @@
need to be more specific than just supplying the control's name, use the
set_value and get_value methods.
- ListControl values are lists of item names. The list item's name is the
- value of the corresponding HTML element's "value" attribute.
+ ListControl values are lists of item names (specifically, the names of the
+ items that are selected and not disabled, and hence are "successful" -- ie.
+ cause data to be returned to the server). The list item's name is the
+ value of the corresponding HTML element's "value" attribute.
Example:
@@ -2321,11 +2402,12 @@
defines a SELECT control with name "more_cheeses" which has two items,
named "1" and "2" (because the OPTION element's value HTML attribute
- defaults to the element contents).
+ defaults to the element contents -- see SelectControl.__doc__ for more on
+ these defaulting rules).
To select, deselect or otherwise manipulate individual list items, use the
HTMLForm.find_control() and ListControl.get() methods. To set the whole
- value, do as for any other control:use indexing or the set_/get_value
+ value, do as for any other control: use indexing or the set_/get_value
methods.
Example:
@@ -2611,7 +2693,9 @@
#---------------------------------------------------
def __str__(self):
- header = "%s %s %s" % (self.method, self.action, self.enctype)
+ header = "%s%s %s %s" % (
+ (self.name and self.name+" " or ""),
+ self.method, self.action, self.enctype)
rep = [header]
for control in self.controls:
rep.append(" %s" % str(control))
@@ -3054,17 +3138,23 @@
def _pairs(self):
"""Return sequence of (key, value) pairs suitable for urlencoding."""
- opairs = []
- for control in self.controls:
- opairs.extend(control._totally_ordered_pairs())
+ return [(k, v) for (i, k, v, c_i) in self._pairs_and_controls()]
+
+ def _pairs_and_controls(self):
+ """Return sequence of (index, key, value, control_index)
+ of totally ordered pairs suitable for urlencoding.
+
+ control_index is the index of the control in self.controls
+ """
+ pairs = []
+ for control_index in range(len(self.controls)):
+ control = self.controls[control_index]
+ for ii, key, val in control._totally_ordered_pairs():
+ pairs.append((ii, key, val, control_index))
+
# stable sort by ONLY first item in tuple
- sorter = []
- for jj in range(len(opairs)):
- ii, key, val = opairs[jj]
- sorter.append((ii, jj, key, val))
- sorter.sort()
- pairs = [(key, val) for (ii, jj, key, val) in sorter]
+ pairs.sort()
return pairs
@@ -3094,8 +3184,8 @@
mw = MimeWriter(data, http_hdrs)
f = mw.startmultipartbody("form-data", add_to_http_hdrs=True,
prefix=0)
- for control in self.controls:
- control._write_mime_data(mw)
+ for ii, k, v, control_index in self._pairs_and_controls():
+ self.controls[control_index]._write_mime_data(mw, k, v)
mw.lastpart()
return uri, data.getvalue(), http_hdrs
else:
@@ -3116,7 +3206,7 @@
req = request_class(req_data[0], req_data[1])
for key, val in req_data[2]:
add_hdr = req.add_header
- if key.lower() == 'content-type':
+ if key.lower() == "content-type":
try:
add_hdr = req.add_unredirected_header
except AttributeError:
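(A hedged sketch, not from the checkin, of the ClientForm API changes above: the
new encoding keyword and the decoding of numeric character references via
unescape_charref. The HTML snippet and the utf-8 choice are illustrative only.)

    from StringIO import StringIO
    import ClientForm

    html = '''
    <form action="submit" method="post">
      <input type="text" name="title" value="caf&#233;">
      <input type="submit" name="go" value="Go">
    </form>
    '''
    # backwards_compat=False silences the 0.1-compatibility deprecation
    # warning; encoding is the keyword argument added in this version
    forms = ClientForm.ParseFile(StringIO(html), "http://example.com/",
                                 backwards_compat=False, encoding="utf-8")
    form = forms[0]
    print form.find_control("title").value      # charref decoded as utf-8
    print ClientForm.unescape_charref("233", "utf-8")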
Modified: Zope3/trunk/src/mechanize/__init__.py
===================================================================
--- Zope3/trunk/src/mechanize/__init__.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/__init__.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -1,4 +1,39 @@
-from _useragent import UserAgent#, http_get, http_put, http_head
-from _mechanize import Browser, Link, FormsFactory, \
- BrowserStateError, LinkNotFoundError, FormNotFoundError, \
- __version__
+from _mechanize import __version__
+
+# high-level stateful browser-style interface
+from _mechanize import \
+ Browser, \
+ BrowserStateError, LinkNotFoundError, FormNotFoundError
+
+# configurable URL-opener interface
+from _useragent import UserAgent
+from _html import \
+ Link, \
+ Factory, DefaultFactory, RobustFactory, \
+ FormsFactory, LinksFactory, TitleFactory, \
+ RobustFormsFactory, RobustLinksFactory, RobustTitleFactory
+
+# urllib2 work-alike interface (part from mechanize, part from urllib2)
+from _urllib2 import *
+
+# misc
+from _util import http2time as str2time
+from _util import response_seek_wrapper, make_response
+from _urllib2_support import HeadParser
+try:
+ from _urllib2_support import XHTMLCompatibleHeadParser
+except ImportError:
+ pass
+#from _gzip import HTTPGzipProcessor # crap ATM
+
+
+# cookies
+from _clientcookie import Cookie, CookiePolicy, DefaultCookiePolicy, \
+ CookieJar, FileCookieJar, LoadError, request_host
+from _lwpcookiejar import LWPCookieJar, lwp_cookie_str
+from _mozillacookiejar import MozillaCookieJar
+from _msiecookiejar import MSIECookieJar
+
+# If you hate the idea of turning bugs into warnings, do:
+# import mechanize; mechanize.USE_BARE_EXCEPT = False
+USE_BARE_EXCEPT = True
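(A short sketch exercising the flattened import surface defined above; the date
string is arbitrary.)

    import mechanize

    br = mechanize.Browser()       # high-level stateful browser interface
    ua = mechanize.UserAgent()     # configurable URL-opener interface
    cj = mechanize.LWPCookieJar()  # cookie classes now ship with mechanize,
                                   # replacing the removed ClientCookie package
    # http2time is re-exported under its old ClientCookie name, str2time
    print mechanize.str2time("Wed, 21 Jun 2006 21:54:07 GMT")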
Added: Zope3/trunk/src/mechanize/_auth.py
===================================================================
--- Zope3/trunk/src/mechanize/_auth.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/_auth.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -0,0 +1,471 @@
+"""HTTP Authentication and Proxy support.
+
+All but HTTPProxyPasswordMgr come from Python 2.5.
+
+
+Copyright 2006 John J. Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
+included with the distribution).
+
+"""
+
+import re, base64, urlparse, posixpath, md5, sha
+
+from urllib2 import BaseHandler
+from urllib import getproxies, unquote, splittype, splituser, splitpasswd
+
+
+def _parse_proxy(proxy):
+ """Return (scheme, user, password, host/port) given a URL or an authority.
+
+ If a URL is supplied, it must have an authority (host:port) component.
+ According to RFC 3986, having an authority component means the URL must
+ have two slashes after the scheme:
+
+ >>> _parse_proxy('file:/ftp.example.com/')
+ Traceback (most recent call last):
+ ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
+
+ The first three items of the returned tuple may be None.
+
+ Examples of authority parsing:
+
+ >>> _parse_proxy('proxy.example.com')
+ (None, None, None, 'proxy.example.com')
+ >>> _parse_proxy('proxy.example.com:3128')
+ (None, None, None, 'proxy.example.com:3128')
+
+ The authority component may optionally include userinfo (assumed to be
+ username:password):
+
+ >>> _parse_proxy('joe:password@proxy.example.com')
+ (None, 'joe', 'password', 'proxy.example.com')
+ >>> _parse_proxy('joe:password@proxy.example.com:3128')
+ (None, 'joe', 'password', 'proxy.example.com:3128')
+
+ Same examples, but with URLs instead:
+
+ >>> _parse_proxy('http://proxy.example.com/')
+ ('http', None, None, 'proxy.example.com')
+ >>> _parse_proxy('http://proxy.example.com:3128/')
+ ('http', None, None, 'proxy.example.com:3128')
+ >>> _parse_proxy('http://joe:password@proxy.example.com/')
+ ('http', 'joe', 'password', 'proxy.example.com')
+ >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
+ ('http', 'joe', 'password', 'proxy.example.com:3128')
+
+ Everything after the authority is ignored:
+
+ >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
+ ('ftp', 'joe', 'password', 'proxy.example.com')
+
+ Test for no trailing '/' case:
+
+ >>> _parse_proxy('http://joe:password@proxy.example.com')
+ ('http', 'joe', 'password', 'proxy.example.com')
+
+ """
+ scheme, r_scheme = splittype(proxy)
+ if not r_scheme.startswith("/"):
+ # authority
+ scheme = None
+ authority = proxy
+ else:
+ # URL
+ if not r_scheme.startswith("//"):
+ raise ValueError("proxy URL with no authority: %r" % proxy)
+ # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
+ # and 3.3.), path is empty or starts with '/'
+ end = r_scheme.find("/", 2)
+ if end == -1:
+ end = None
+ authority = r_scheme[2:end]
+ userinfo, hostport = splituser(authority)
+ if userinfo is not None:
+ user, password = splitpasswd(userinfo)
+ else:
+ user = password = None
+ return scheme, user, password, hostport
+
+class ProxyHandler(BaseHandler):
+ # Proxies must be in front
+ handler_order = 100
+
+ def __init__(self, proxies=None):
+ if proxies is None:
+ proxies = getproxies()
+ assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
+ self.proxies = proxies
+ for type, url in proxies.items():
+ setattr(self, '%s_open' % type,
+ lambda r, proxy=url, type=type, meth=self.proxy_open: \
+ meth(r, proxy, type))
+
+ def proxy_open(self, req, proxy, type):
+ orig_type = req.get_type()
+ proxy_type, user, password, hostport = _parse_proxy(proxy)
+ if proxy_type is None:
+ proxy_type = orig_type
+ if user and password:
+ user_pass = '%s:%s' % (unquote(user), unquote(password))
+ creds = base64.encodestring(user_pass).strip()
+ req.add_header('Proxy-authorization', 'Basic ' + creds)
+ hostport = unquote(hostport)
+ req.set_proxy(hostport, proxy_type)
+ if orig_type == proxy_type:
+ # let other handlers take care of it
+ return None
+ else:
+ # need to start over, because the other handlers don't
+ # grok the proxy's URL type
+ # e.g. if we have a constructor arg proxies like so:
+ # {'http': 'ftp://proxy.example.com'}, we may end up turning
+ # a request for http://acme.example.com/a into one for
+ # ftp://proxy.example.com/a
+ return self.parent.open(req)
+
+class HTTPPasswordMgr:
+
+ def __init__(self):
+ self.passwd = {}
+
+ def add_password(self, realm, uri, user, passwd):
+ # uri could be a single URI or a sequence
+ if isinstance(uri, basestring):
+ uri = [uri]
+ uri = tuple(map(self.reduce_uri, uri))
+ if not realm in self.passwd:
+ self.passwd[realm] = {}
+ self.passwd[realm][uri] = (user, passwd)
+
+ def find_user_password(self, realm, authuri):
+ domains = self.passwd.get(realm, {})
+ authuri = self.reduce_uri(authuri)
+ for uris, authinfo in domains.iteritems():
+ for uri in uris:
+ if self.is_suburi(uri, authuri):
+ return authinfo
+ return None, None
+
+ def reduce_uri(self, uri):
+ """Accept netloc or URI and extract only the netloc and path"""
+ parts = urlparse.urlsplit(uri)
+ if parts[1]:
+ # URI
+ return parts[1], parts[2] or '/'
+ elif parts[0]:
+ # host:port
+ return uri, '/'
+ else:
+ # host
+ return parts[2], '/'
+
+ def is_suburi(self, base, test):
+ """Check if test is below base in a URI tree
+
+ Both args must be URIs in reduced form.
+ """
+ if base == test:
+ return True
+ if base[0] != test[0]:
+ return False
+ common = posixpath.commonprefix((base[1], test[1]))
+ if len(common) == len(base[1]):
+ return True
+ return False
+
+
+class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
+
+ def find_user_password(self, realm, authuri):
+ user, password = HTTPPasswordMgr.find_user_password(self, realm,
+ authuri)
+ if user is not None:
+ return user, password
+ return HTTPPasswordMgr.find_user_password(self, None, authuri)
+
+
+class AbstractBasicAuthHandler:
+
+ rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)
+
+ # XXX there can actually be multiple auth-schemes in a
+ # www-authenticate header. should probably be a lot more careful
+ # in parsing them to extract multiple alternatives
+
+ def __init__(self, password_mgr=None):
+ if password_mgr is None:
+ password_mgr = HTTPPasswordMgr()
+ self.passwd = password_mgr
+ self.add_password = self.passwd.add_password
+
+ def http_error_auth_reqed(self, authreq, host, req, headers):
+ # host may be an authority (without userinfo) or a URL with an
+ # authority
+ # XXX could be multiple headers
+ authreq = headers.get(authreq, None)
+ if authreq:
+ mo = AbstractBasicAuthHandler.rx.search(authreq)
+ if mo:
+ scheme, realm = mo.groups()
+ if scheme.lower() == 'basic':
+ return self.retry_http_basic_auth(host, req, realm)
+
+ def retry_http_basic_auth(self, host, req, realm):
+ user, pw = self.passwd.find_user_password(realm, host)
+ if pw is not None:
+ raw = "%s:%s" % (user, pw)
+ auth = 'Basic %s' % base64.encodestring(raw).strip()
+ if req.headers.get(self.auth_header, None) == auth:
+ return None
+ req.add_header(self.auth_header, auth)
+ return self.parent.open(req)
+ else:
+ return None
+
+
+class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
+
+ auth_header = 'Authorization'
+
+ def http_error_401(self, req, fp, code, msg, headers):
+ url = req.get_full_url()
+ return self.http_error_auth_reqed('www-authenticate',
+ url, req, headers)
+
+
+class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
+
+ auth_header = 'Proxy-authorization'
+
+ def http_error_407(self, req, fp, code, msg, headers):
+ # http_error_auth_reqed requires that there is no userinfo component in
+ # authority. Assume there isn't one, since urllib2 does not (and
+ # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
+ # userinfo.
+ authority = req.get_host()
+ return self.http_error_auth_reqed('proxy-authenticate',
+ authority, req, headers)
+
+
+def randombytes(n):
+ """Return n random bytes."""
+ # Use /dev/urandom if it is available. Fall back to random module
+ # if not. It might be worthwhile to extend this function to use
+ # other platform-specific mechanisms for getting random bytes.
+ if os.path.exists("/dev/urandom"):
+ f = open("/dev/urandom")
+ s = f.read(n)
+ f.close()
+ return s
+ else:
+ L = [chr(random.randrange(0, 256)) for i in range(n)]
+ return "".join(L)
+
+class AbstractDigestAuthHandler:
+ # Digest authentication is specified in RFC 2617.
+
+ # XXX The client does not inspect the Authentication-Info header
+ # in a successful response.
+
+ # XXX It should be possible to test this implementation against
+ # a mock server that just generates a static set of challenges.
+
+ # XXX qop="auth-int" supports is shaky
+
+ def __init__(self, passwd=None):
+ if passwd is None:
+ passwd = HTTPPasswordMgr()
+ self.passwd = passwd
+ self.add_password = self.passwd.add_password
+ self.retried = 0
+ self.nonce_count = 0
+
+ def reset_retry_count(self):
+ self.retried = 0
+
+ def http_error_auth_reqed(self, auth_header, host, req, headers):
+ authreq = headers.get(auth_header, None)
+ if self.retried > 5:
+ # Don't fail endlessly - if we failed once, we'll probably
+ # fail a second time. Hm. Unless the Password Manager is
+ # prompting for the information. Crap. This isn't great
+ # but it's better than the current 'repeat until recursion
+ # depth exceeded' approach <wink>
+ raise HTTPError(req.get_full_url(), 401, "digest auth failed",
+ headers, None)
+ else:
+ self.retried += 1
+ if authreq:
+ scheme = authreq.split()[0]
+ if scheme.lower() == 'digest':
+ return self.retry_http_digest_auth(req, authreq)
+
+ def retry_http_digest_auth(self, req, auth):
+ token, challenge = auth.split(' ', 1)
+ chal = parse_keqv_list(parse_http_list(challenge))
+ auth = self.get_authorization(req, chal)
+ if auth:
+ auth_val = 'Digest %s' % auth
+ if req.headers.get(self.auth_header, None) == auth_val:
+ return None
+ req.add_unredirected_header(self.auth_header, auth_val)
+ resp = self.parent.open(req)
+ return resp
+
+ def get_cnonce(self, nonce):
+ # The cnonce-value is an opaque
+ # quoted string value provided by the client and used by both client
+ # and server to avoid chosen plaintext attacks, to provide mutual
+ # authentication, and to provide some message integrity protection.
+ # This isn't a fabulous effort, but it's probably Good Enough.
+ dig = sha.new("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
+ randombytes(8))).hexdigest()
+ return dig[:16]
+
+ def get_authorization(self, req, chal):
+ try:
+ realm = chal['realm']
+ nonce = chal['nonce']
+ qop = chal.get('qop')
+ algorithm = chal.get('algorithm', 'MD5')
+ # mod_digest doesn't send an opaque, even though it isn't
+ # supposed to be optional
+ opaque = chal.get('opaque', None)
+ except KeyError:
+ return None
+
+ H, KD = self.get_algorithm_impls(algorithm)
+ if H is None:
+ return None
+
+ user, pw = self.passwd.find_user_password(realm, req.get_full_url())
+ if user is None:
+ return None
+
+ # XXX not implemented yet
+ if req.has_data():
+ entdig = self.get_entity_digest(req.get_data(), chal)
+ else:
+ entdig = None
+
+ A1 = "%s:%s:%s" % (user, realm, pw)
+ A2 = "%s:%s" % (req.get_method(),
+ # XXX selector: what about proxies and full urls
+ req.get_selector())
+ if qop == 'auth':
+ self.nonce_count += 1
+ ncvalue = '%08x' % self.nonce_count
+ cnonce = self.get_cnonce(nonce)
+ noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
+ respdig = KD(H(A1), noncebit)
+ elif qop is None:
+ respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
+ else:
+ # XXX handle auth-int.
+ pass
+
+ # XXX should the partial digests be encoded too?
+
+ base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
+ 'response="%s"' % (user, realm, nonce, req.get_selector(),
+ respdig)
+ if opaque:
+ base += ', opaque="%s"' % opaque
+ if entdig:
+ base += ', digest="%s"' % entdig
+ base += ', algorithm="%s"' % algorithm
+ if qop:
+ base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
+ return base
+
+ def get_algorithm_impls(self, algorithm):
+ # lambdas assume digest modules are imported at the top level
+ if algorithm == 'MD5':
+ H = lambda x: md5.new(x).hexdigest()
+ elif algorithm == 'SHA':
+ H = lambda x: sha.new(x).hexdigest()
+ # XXX MD5-sess
+ KD = lambda s, d: H("%s:%s" % (s, d))
+ return H, KD
+
+ def get_entity_digest(self, data, chal):
+ # XXX not implemented yet
+ return None
+
+
+class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
+ """An authentication protocol defined by RFC 2069
+
+ Digest authentication improves on basic authentication because it
+ does not transmit passwords in the clear.
+ """
+
+ auth_header = 'Authorization'
+
+ def http_error_401(self, req, fp, code, msg, headers):
+ host = urlparse.urlparse(req.get_full_url())[1]
+ retry = self.http_error_auth_reqed('www-authenticate',
+ host, req, headers)
+ self.reset_retry_count()
+ return retry
+
+
+class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
+
+ auth_header = 'Proxy-Authorization'
+
+ def http_error_407(self, req, fp, code, msg, headers):
+ host = req.get_host()
+ retry = self.http_error_auth_reqed('proxy-authenticate',
+ host, req, headers)
+ self.reset_retry_count()
+ return retry
+
+
+
+class HTTPProxyPasswordMgr(HTTPPasswordMgr):
+ # has default realm and host/port
+ def add_password(self, realm, uri, user, passwd):
+ # uri could be a single URI or a sequence
+ if uri is None or isinstance(uri, basestring):
+ uris = [uri]
+ else:
+ uris = uri
+ passwd_by_domain = self.passwd.setdefault(realm, {})
+ for uri in uris:
+ uri = self.reduce_uri(uri)
+ passwd_by_domain[uri] = (user, passwd)
+
+ def find_user_password(self, realm, authuri):
+ perms = [(realm, authuri), (None, authuri)]
+ # bleh, want default realm to take precedence over default
+ # URI/authority, hence this outer loop
+ for default_uri in False, True:
+ for realm, authuri in perms:
+ authinfo_by_domain = self.passwd.get(realm, {})
+ reduced_authuri = self.reduce_uri(authuri)
+ for uri, authinfo in authinfo_by_domain.iteritems():
+ if uri is None and not default_uri:
+ continue
+ if self.is_suburi(uri, reduced_authuri):
+ return authinfo
+ user, password = None, None
+
+ if user is not None:
+ break
+ return user, password
+
+ def reduce_uri(self, uri):
+ if uri is None:
+ return None
+ return HTTPPasswordMgr.reduce_uri(self, uri)
+
+ def is_suburi(self, base, test):
+ if base is None:
+ # default to the proxy's host/port
+ hostport, path = test
+ base = (hostport, "/")
+ return HTTPPasswordMgr.is_suburi(self, base, test)
Property changes on: Zope3/trunk/src/mechanize/_auth.py
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
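(A small sketch of the proxy-auth helpers added above; the proxy host, realm and
credentials are placeholders, and the password manager is shown standalone. In
practice it is consulted from the http_error_407 handlers defined in this file.)

    from mechanize._auth import _parse_proxy, HTTPProxyPasswordMgr

    print _parse_proxy("http://joe:password@proxy.example.com:3128")
    # ('http', 'joe', 'password', 'proxy.example.com:3128')

    # None for realm and URI registers default credentials that match any
    # proxy realm and any proxy host/port
    password_mgr = HTTPProxyPasswordMgr()
    password_mgr.add_password(None, None, "joe", "password")
    print password_mgr.find_user_password("Squid", "proxy.example.com:3128")
    # ('joe', 'password')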
Added: Zope3/trunk/src/mechanize/_clientcookie.py
===================================================================
--- Zope3/trunk/src/mechanize/_clientcookie.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/_clientcookie.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -0,0 +1,1656 @@
+"""HTTP cookie handling for web clients, plus some other stuff.
+
+This module originally developed from my port of Gisle Aas' Perl module
+HTTP::Cookies, from the libwww-perl library.
+
+Docstrings, comments and debug strings in this code refer to the
+attributes of the HTTP cookie system as cookie-attributes, to distinguish
+them clearly from Python attributes.
+
+ CookieJar____
+ / \ \
+ FileCookieJar \ \
+ / | \ \ \
+ MozillaCookieJar | LWPCookieJar \ \
+ | | \
+ | ---MSIEBase | \
+ | / | | \
+ | / MSIEDBCookieJar BSDDBCookieJar
+ |/
+ MSIECookieJar
+
+Comments to John J Lee <jjl at pobox.com>.
+
+
+Copyright 2002-2006 John J Lee <jjl at pobox.com>
+Copyright 1997-1999 Gisle Aas (original libwww-perl code)
+Copyright 2002-2003 Johnny Lee (original MSIE Perl code)
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file
+COPYING.txt included with the distribution).
+
+"""
+
+import sys, re, urlparse, string, copy, time, struct, urllib, types, logging
+try:
+ import threading
+ _threading = threading; del threading
+except ImportError:
+ import dummy_threading
+ _threading = dummy_threading; del dummy_threading
+import httplib # only for the default HTTP port
+
+MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
+ "instance initialised with one)")
+DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
+
+from _headersutil import split_header_words, parse_ns_headers
+from _util import startswith, endswith, isstringlike, getheaders
+
+debug = logging.getLogger("mechanize.cookies").debug
+
+
+def reraise_unmasked_exceptions(unmasked=()):
+ # There are a few catch-all except: statements in this module, for
+ # catching input that's bad in unexpected ways.
+ # This function re-raises some exceptions we don't want to trap.
+ import mechanize, warnings
+ if not mechanize.USE_BARE_EXCEPT:
+ raise
+ unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError)
+ etype = sys.exc_info()[0]
+ if issubclass(etype, unmasked):
+ raise
+ # swallowed an exception
+ import traceback, StringIO
+ f = StringIO.StringIO()
+ traceback.print_exc(None, f)
+ msg = f.getvalue()
+ warnings.warn("mechanize bug!\n%s" % msg, stacklevel=2)
+
+
+IPV4_RE = re.compile(r"\.\d+$")
+def is_HDN(text):
+ """Return True if text is a host domain name."""
+ # XXX
+ # This may well be wrong. Which RFC is HDN defined in, if any (for
+ # the purposes of RFC 2965)?
+ # For the current implementation, what about IPv6? Remember to look
+ # at other uses of IPV4_RE also, if change this.
+ return not (IPV4_RE.search(text) or
+ text == "" or
+ text[0] == "." or text[-1] == ".")
+
+def domain_match(A, B):
+ """Return True if domain A domain-matches domain B, according to RFC 2965.
+
+ A and B may be host domain names or IP addresses.
+
+ RFC 2965, section 1:
+
+ Host names can be specified either as an IP address or a HDN string.
+ Sometimes we compare one host name with another. (Such comparisons SHALL
+ be case-insensitive.) Host A's name domain-matches host B's if
+
+ * their host name strings string-compare equal; or
+
+ * A is a HDN string and has the form NB, where N is a non-empty
+ name string, B has the form .B', and B' is a HDN string. (So,
+ x.y.com domain-matches .Y.com but not Y.com.)
+
+ Note that domain-match is not a commutative operation: a.b.c.com
+ domain-matches .c.com, but not the reverse.
+
+ """
+ # Note that, if A or B are IP addresses, the only relevant part of the
+ # definition of the domain-match algorithm is the direct string-compare.
+ A = string.lower(A)
+ B = string.lower(B)
+ if A == B:
+ return True
+ if not is_HDN(A):
+ return False
+ i = string.rfind(A, B)
+ has_form_nb = not (i == -1 or i == 0)
+ return (
+ has_form_nb and
+ startswith(B, ".") and
+ is_HDN(B[1:])
+ )
+
+def liberal_is_HDN(text):
+ """Return True if text is a sort-of-like a host domain name.
+
+ For accepting/blocking domains.
+
+ """
+ return not IPV4_RE.search(text)
+
+def user_domain_match(A, B):
+ """For blocking/accepting domains.
+
+ A and B may be host domain names or IP addresses.
+
+ """
+ A = string.lower(A)
+ B = string.lower(B)
+ if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
+ if A == B:
+ # equal IP addresses
+ return True
+ return False
+ initial_dot = startswith(B, ".")
+ if initial_dot and endswith(A, B):
+ return True
+ if not initial_dot and A == B:
+ return True
+ return False
+
+cut_port_re = re.compile(r":\d+$")
+def request_host(request):
+ """Return request-host, as defined by RFC 2965.
+
+ Variation from RFC: returned value is lowercased, for convenient
+ comparison.
+
+ """
+ url = request.get_full_url()
+ host = urlparse.urlparse(url)[1]
+ if host == "":
+ host = request.get_header("Host", "")
+
+ # remove port, if present
+ host = cut_port_re.sub("", host, 1)
+ return string.lower(host)
+
+def eff_request_host(request):
+ """Return a tuple (request-host, effective request-host name).
+
+ As defined by RFC 2965, except both are lowercased.
+
+ """
+ erhn = req_host = request_host(request)
+ if string.find(req_host, ".") == -1 and not IPV4_RE.search(req_host):
+ erhn = req_host + ".local"
+ return req_host, erhn
+
+def request_path(request):
+ """request-URI, as defined by RFC 2965."""
+ url = request.get_full_url()
+ #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(url)
+ #req_path = escape_path(string.join(urlparse.urlparse(url)[2:], ""))
+ path, parameters, query, frag = urlparse.urlparse(url)[2:]
+ if parameters:
+ path = "%s;%s" % (path, parameters)
+ path = escape_path(path)
+ req_path = urlparse.urlunparse(("", "", path, "", query, frag))
+ if not startswith(req_path, "/"):
+ # fix bad RFC 2396 absoluteURI
+ req_path = "/"+req_path
+ return req_path
+
+def request_port(request):
+ host = request.get_host()
+ i = string.find(host, ':')
+ if i >= 0:
+ port = host[i+1:]
+ try:
+ int(port)
+ except ValueError:
+ debug("nonnumeric port: '%s'", port)
+ return None
+ else:
+ port = DEFAULT_HTTP_PORT
+ return port
+
+# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
+# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
+HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
+ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
+def uppercase_escaped_char(match):
+ return "%%%s" % string.upper(match.group(1))
+def escape_path(path):
+ """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
+ # There's no knowing what character encoding was used to create URLs
+ # containing %-escapes, but since we have to pick one to escape invalid
+ # path characters, we pick UTF-8, as recommended in the HTML 4.0
+ # specification:
+ # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
+ # And here, kind of: draft-fielding-uri-rfc2396bis-03
+ # (And in draft IRI specification: draft-duerst-iri-05)
+ # (And here, for new URI schemes: RFC 2718)
+ if isinstance(path, types.UnicodeType):
+ path = path.encode("utf-8")
+ path = urllib.quote(path, HTTP_PATH_SAFE)
+ path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path)
+ return path
+
+def reach(h):
+ """Return reach of host h, as defined by RFC 2965, section 1.
+
+ The reach R of a host name H is defined as follows:
+
+ * If
+
+ - H is the host domain name of a host; and,
+
+ - H has the form A.B; and
+
+ - A has no embedded (that is, interior) dots; and
+
+ - B has at least one embedded dot, or B is the string "local".
+ then the reach of H is .B.
+
+ * Otherwise, the reach of H is H.
+
+ >>> reach("www.acme.com")
+ '.acme.com'
+ >>> reach("acme.com")
+ 'acme.com'
+ >>> reach("acme.local")
+ '.local'
+
+ """
+ i = string.find(h, ".")
+ if i >= 0:
+ #a = h[:i] # this line is only here to show what a is
+ b = h[i+1:]
+ i = string.find(b, ".")
+ if is_HDN(h) and (i >= 0 or b == "local"):
+ return "."+b
+ return h
+
+def is_third_party(request):
+ """
+
+ RFC 2965, section 3.3.6:
+
+ An unverifiable transaction is to a third-party host if its request-
+ host U does not domain-match the reach R of the request-host O in the
+ origin transaction.
+
+ """
+ req_host = request_host(request)
+ # the origin request's request-host was stuffed into request by
+ # _urllib2_support.AbstractHTTPHandler
+ return not domain_match(req_host, reach(request.origin_req_host))
+
+
+class Cookie:
+ """HTTP Cookie.
+
+ This class represents both Netscape and RFC 2965 cookies.
+
+ This is deliberately a very simple class. It just holds attributes. It's
+ possible to construct Cookie instances that don't comply with the cookie
+ standards. CookieJar.make_cookies is the factory function for Cookie
+ objects -- it deals with cookie parsing, supplying defaults, and
+ normalising to the representation used in this class. CookiePolicy is
+ responsible for checking them to see whether they should be accepted from
+ and returned to the server.
+
+ version: integer;
+ name: string;
+ value: string (may be None);
+ port: string; None indicates no attribute was supplied (eg. "Port", rather
+ than eg. "Port=80"); otherwise, a port string (eg. "80") or a port list
+ string (eg. "80,8080")
+ port_specified: boolean; true if a value was supplied with the Port
+ cookie-attribute
+ domain: string;
+ domain_specified: boolean; true if Domain was explicitly set
+ domain_initial_dot: boolean; true if Domain as set in HTTP header by server
+ started with a dot (yes, this really is necessary!)
+ path: string;
+ path_specified: boolean; true if Path was explicitly set
+ secure: boolean; true if should only be returned over secure connection
+ expires: integer; seconds since epoch (RFC 2965 cookies should calculate
+ this value from the Max-Age attribute)
+ discard: boolean, true if this is a session cookie; (if no expires value,
+ this should be true)
+ comment: string;
+ comment_url: string;
+ rfc2109: boolean; true if cookie arrived in a Set-Cookie: (not
+ Set-Cookie2:) header, but had a version cookie-attribute of 1
+ rest: mapping of other cookie-attributes
+
+ Note that the port may be present in the headers, but unspecified ("Port"
+ rather than"Port=80", for example); if this is the case, port is None.
+
+ """
+
+ def __init__(self, version, name, value,
+ port, port_specified,
+ domain, domain_specified, domain_initial_dot,
+ path, path_specified,
+ secure,
+ expires,
+ discard,
+ comment,
+ comment_url,
+ rest,
+ rfc2109=False,
+ ):
+
+ if version is not None: version = int(version)
+ if expires is not None: expires = int(expires)
+ if port is None and port_specified is True:
+ raise ValueError("if port is None, port_specified must be false")
+
+ self.version = version
+ self.name = name
+ self.value = value
+ self.port = port
+ self.port_specified = port_specified
+ # normalise case, as per RFC 2965 section 3.3.3
+ self.domain = string.lower(domain)
+ self.domain_specified = domain_specified
+ # Sigh. We need to know whether the domain given in the
+ # cookie-attribute had an initial dot, in order to follow RFC 2965
+ # (as clarified in draft errata). Needed for the returned $Domain
+ # value.
+ self.domain_initial_dot = domain_initial_dot
+ self.path = path
+ self.path_specified = path_specified
+ self.secure = secure
+ self.expires = expires
+ self.discard = discard
+ self.comment = comment
+ self.comment_url = comment_url
+ self.rfc2109 = rfc2109
+
+ self._rest = copy.copy(rest)
+
+ def has_nonstandard_attr(self, name):
+ return self._rest.has_key(name)
+ def get_nonstandard_attr(self, name, default=None):
+ return self._rest.get(name, default)
+ def set_nonstandard_attr(self, name, value):
+ self._rest[name] = value
+ def nonstandard_attr_keys(self):
+ return self._rest.keys()
+
+ def is_expired(self, now=None):
+ if now is None: now = time.time()
+ return (self.expires is not None) and (self.expires <= now)
+
+ def __str__(self):
+ if self.port is None: p = ""
+ else: p = ":"+self.port
+ limit = self.domain + p + self.path
+ if self.value is not None:
+ namevalue = "%s=%s" % (self.name, self.value)
+ else:
+ namevalue = self.name
+ return "<Cookie %s for %s>" % (namevalue, limit)
+
+ def __repr__(self):
+ args = []
+ for name in ["version", "name", "value",
+ "port", "port_specified",
+ "domain", "domain_specified", "domain_initial_dot",
+ "path", "path_specified",
+ "secure", "expires", "discard", "comment", "comment_url",
+ ]:
+ attr = getattr(self, name)
+ args.append("%s=%s" % (name, repr(attr)))
+ args.append("rest=%s" % repr(self._rest))
+ args.append("rfc2109=%s" % repr(self.rfc2109))
+ return "Cookie(%s)" % string.join(args, ", ")
+
+
+class CookiePolicy:
+ """Defines which cookies get accepted from and returned to server.
+
+ May also modify cookies.
+
+ The subclass DefaultCookiePolicy defines the standard rules for Netscape
+ and RFC 2965 cookies -- override that if you want a customised policy.
+
+ As well as implementing set_ok and return_ok, implementations of this
+ interface must also supply the following attributes, indicating which
+ protocols should be used, and how. These can be read and set at any time,
+ though whether that makes complete sense from the protocol point of view is
+ doubtful.
+
+ Public attributes:
+
+ netscape: implement netscape protocol
+ rfc2965: implement RFC 2965 protocol
+ rfc2109_as_netscape:
+ WARNING: This argument will change or go away if it is not accepted into
+ the Python standard library in this form!
+ If true, treat RFC 2109 cookies as though they were Netscape cookies. The
+ default is for this attribute to be None, which means treat 2109 cookies
+ as RFC 2965 cookies unless RFC 2965 handling is switched off (which it is,
+ by default), and as Netscape cookies otherwise.
+ hide_cookie2: don't add Cookie2 header to requests (the presence of
+ this header indicates to the server that we understand RFC 2965
+ cookies)
+
+ """
+ def set_ok(self, cookie, request):
+ """Return true if (and only if) cookie should be accepted from server.
+
+ Currently, pre-expired cookies never get this far -- the CookieJar
+ class deletes such cookies itself.
+
+ cookie: mechanize.Cookie object
+ request: object implementing the interface defined by
+ CookieJar.extract_cookies.__doc__
+
+ """
+ raise NotImplementedError()
+
+ def return_ok(self, cookie, request):
+ """Return true if (and only if) cookie should be returned to server.
+
+ cookie: mechanize.Cookie object
+ request: object implementing the interface defined by
+ CookieJar.add_cookie_header.__doc__
+
+ """
+ raise NotImplementedError()
+
+ def domain_return_ok(self, domain, request):
+ """Return false if cookies should not be returned, given cookie domain.
+
+ This is here as an optimization, to remove the need for checking every
+ cookie with a particular domain (which may involve reading many files).
+ The default implementations of domain_return_ok and path_return_ok
+ (return True) leave all the work to return_ok.
+
+ If domain_return_ok returns true for the cookie domain, path_return_ok
+ is called for the cookie path. Otherwise, path_return_ok and return_ok
+ are never called for that cookie domain. If path_return_ok returns
+ true, return_ok is called with the Cookie object itself for a full
+ check. Otherwise, return_ok is never called for that cookie path.
+
+ Note that domain_return_ok is called for every *cookie* domain, not
+ just for the *request* domain. For example, the function might be
+ called with both ".acme.com" and "www.acme.com" if the request domain is
+ "www.acme.com". The same goes for path_return_ok.
+
+ For argument documentation, see the docstring for return_ok.
+
+ """
+ return True
+
+ def path_return_ok(self, path, request):
+ """Return false if cookies should not be returned, given cookie path.
+
+ See the docstring for domain_return_ok.
+
+ """
+ return True
+
+
+class DefaultCookiePolicy(CookiePolicy):
+ """Implements the standard rules for accepting and returning cookies.
+
+ Both RFC 2965 and Netscape cookies are covered. RFC 2965 handling is
+ switched off by default.
+
+ The easiest way to provide your own policy is to override this class and
+ call its methods in your overridden implementations before adding your own
+ additional checks.
+
+ import mechanize
+ class MyCookiePolicy(mechanize.DefaultCookiePolicy):
+ def set_ok(self, cookie, request):
+ if not mechanize.DefaultCookiePolicy.set_ok(
+ self, cookie, request):
+ return False
+ if i_dont_want_to_store_this_cookie():
+ return False
+ return True
+
+ In addition to the features required to implement the CookiePolicy
+ interface, this class allows you to block and allow domains from setting
+ and receiving cookies. There are also some strictness switches that allow
+ you to tighten up the rather loose Netscape protocol rules a little bit (at
+ the cost of blocking some benign cookies).
+
+ A domain blacklist and whitelist are provided (both off by default). Only
+ domains not in the blacklist and present in the whitelist (if the whitelist
+ is active) participate in cookie setting and returning. Use the
+ blocked_domains constructor argument, and blocked_domains and
+ set_blocked_domains methods (and the corresponding argument and methods for
+ allowed_domains). If you set a whitelist, you can turn it off again by
+ setting it to None.
+
+ Domains in block or allow lists that do not start with a dot must
+ string-compare equal. For example, "acme.com" matches a blacklist entry of
+ "acme.com", but "www.acme.com" does not. Domains that do start with a dot
+ are matched by more specific domains too. For example, both "www.acme.com"
+ and "www.munitions.acme.com" match ".acme.com" (but "acme.com" itself does
+ not). IP addresses are an exception, and must match exactly. For example,
+ if blocked_domains contains "192.168.1.2" and ".168.1.2", then 192.168.1.2 is
+ blocked, but 193.168.1.2 is not.
+
+ Additional Public Attributes:
+
+ General strictness switches
+
+ strict_domain: don't allow sites to set two-component domains with
+ country-code top-level domains like .co.uk, .gov.uk, .co.nz, etc.
+ This is far from perfect and isn't guaranteed to work!
+
+ RFC 2965 protocol strictness switches
+
+ strict_rfc2965_unverifiable: follow RFC 2965 rules on unverifiable
+ transactions (usually, an unverifiable transaction is one resulting from
+ a redirect or an image hosted on another site); if this is false, cookies
+ are NEVER blocked on the basis of verifiability
+
+ Netscape protocol strictness switches
+
+ strict_ns_unverifiable: apply RFC 2965 rules on unverifiable transactions
+ even to Netscape cookies
+ strict_ns_domain: flags indicating how strict to be with domain-matching
+ rules for Netscape cookies:
+ DomainStrictNoDots: when setting cookies, host prefix must not contain a
+ dot (eg. www.foo.bar.com can't set a cookie for .bar.com, because
+ www.foo contains a dot)
+ DomainStrictNonDomain: cookies that did not explicitly specify a Domain
+ cookie-attribute can only be returned to a domain that string-compares
+ equal to the domain that set the cookie (eg. rockets.acme.com won't
+ be returned cookies from acme.com that had no Domain cookie-attribute)
+ DomainRFC2965Match: when setting cookies, require a full RFC 2965
+ domain-match
+ DomainLiberal and DomainStrict are the most useful combinations of the
+ above flags, for convenience
+ strict_ns_set_initial_dollar: ignore cookies in Set-Cookie: headers that
+ have names starting with '$'
+ strict_ns_set_path: don't allow setting cookies whose path doesn't
+ path-match request URI
+
+ """
+
+ DomainStrictNoDots = 1
+ DomainStrictNonDomain = 2
+ DomainRFC2965Match = 4
+
+ DomainLiberal = 0
+ DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
+
+ def __init__(self,
+ blocked_domains=None, allowed_domains=None,
+ netscape=True, rfc2965=False,
+ # WARNING: this argument will change or go away if it is not
+ # accepted into the Python standard library in this form!
+ # default, ie. treat 2109 as netscape iff not rfc2965
+ rfc2109_as_netscape=None,
+ hide_cookie2=False,
+ strict_domain=False,
+ strict_rfc2965_unverifiable=True,
+ strict_ns_unverifiable=False,
+ strict_ns_domain=DomainLiberal,
+ strict_ns_set_initial_dollar=False,
+ strict_ns_set_path=False,
+ ):
+ """
+ Constructor arguments should be used as keyword arguments only.
+
+ blocked_domains: sequence of domain names that we never accept cookies
+ from, nor return cookies to
+ allowed_domains: if not None, this is a sequence of the only domains
+ for which we accept and return cookies
+
+ For other arguments, see CookiePolicy.__doc__ and
+ DefaultCookiePolicy.__doc__.
+
+ """
+ self.netscape = netscape
+ self.rfc2965 = rfc2965
+ self.rfc2109_as_netscape = rfc2109_as_netscape
+ self.hide_cookie2 = hide_cookie2
+ self.strict_domain = strict_domain
+ self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
+ self.strict_ns_unverifiable = strict_ns_unverifiable
+ self.strict_ns_domain = strict_ns_domain
+ self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
+ self.strict_ns_set_path = strict_ns_set_path
+
+ if blocked_domains is not None:
+ self._blocked_domains = tuple(blocked_domains)
+ else:
+ self._blocked_domains = ()
+
+ if allowed_domains is not None:
+ allowed_domains = tuple(allowed_domains)
+ self._allowed_domains = allowed_domains
+
+ def blocked_domains(self):
+ """Return the sequence of blocked domains (as a tuple)."""
+ return self._blocked_domains
+ def set_blocked_domains(self, blocked_domains):
+ """Set the sequence of blocked domains."""
+ self._blocked_domains = tuple(blocked_domains)
+
+ def is_blocked(self, domain):
+ for blocked_domain in self._blocked_domains:
+ if user_domain_match(domain, blocked_domain):
+ return True
+ return False
+
+ def allowed_domains(self):
+ """Return None, or the sequence of allowed domains (as a tuple)."""
+ return self._allowed_domains
+ def set_allowed_domains(self, allowed_domains):
+ """Set the sequence of allowed domains, or None."""
+ if allowed_domains is not None:
+ allowed_domains = tuple(allowed_domains)
+ self._allowed_domains = allowed_domains
+
+ def is_not_allowed(self, domain):
+ if self._allowed_domains is None:
+ return False
+ for allowed_domain in self._allowed_domains:
+ if user_domain_match(domain, allowed_domain):
+ return False
+ return True
+
+ def set_ok(self, cookie, request):
+ """
+ If you override set_ok, be sure to call this method. If it returns
+ false, so should your subclass (assuming your subclass wants to be more
+ strict about which cookies to accept).
+
+ """
+ debug(" - checking cookie %s", cookie)
+
+ assert cookie.name is not None
+
+ for n in "version", "verifiability", "name", "path", "domain", "port":
+ fn_name = "set_ok_"+n
+ fn = getattr(self, fn_name)
+ if not fn(cookie, request):
+ return False
+
+ return True
+
+ def set_ok_version(self, cookie, request):
+ if cookie.version is None:
+ # Version is always set to 0 by parse_ns_headers if it's a Netscape
+ # cookie, so this must be an invalid RFC 2965 cookie.
+ debug(" Set-Cookie2 without version attribute (%s)", cookie)
+ return False
+ if cookie.version > 0 and not self.rfc2965:
+ debug(" RFC 2965 cookies are switched off")
+ return False
+ elif cookie.version == 0 and not self.netscape:
+ debug(" Netscape cookies are switched off")
+ return False
+ return True
+
+ def set_ok_verifiability(self, cookie, request):
+ if request.unverifiable and is_third_party(request):
+ if cookie.version > 0 and self.strict_rfc2965_unverifiable:
+ debug(" third-party RFC 2965 cookie during "
+ "unverifiable transaction")
+ return False
+ elif cookie.version == 0 and self.strict_ns_unverifiable:
+ debug(" third-party Netscape cookie during "
+ "unverifiable transaction")
+ return False
+ return True
+
+ def set_ok_name(self, cookie, request):
+ # Try and stop servers setting V0 cookies designed to hack other
+ # servers that know both V0 and V1 protocols.
+ if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
+ startswith(cookie.name, "$")):
+ debug(" illegal name (starts with '$'): '%s'", cookie.name)
+ return False
+ return True
+
+ def set_ok_path(self, cookie, request):
+ if cookie.path_specified:
+ req_path = request_path(request)
+ if ((cookie.version > 0 or
+ (cookie.version == 0 and self.strict_ns_set_path)) and
+ not startswith(req_path, cookie.path)):
+ debug(" path attribute %s is not a prefix of request "
+ "path %s", cookie.path, req_path)
+ return False
+ return True
+
+ def set_ok_countrycode_domain(self, cookie, request):
+ """Return False if explicit cookie domain is not acceptable.
+
+ Called by set_ok_domain, for convenience of overriding by
+ subclasses.
+
+ """
+ if cookie.domain_specified and self.strict_domain:
+ domain = cookie.domain
+ # since domain was specified, we know that:
+ assert domain.startswith(".")
+ if string.count(domain, ".") == 2:
+ # domain like .foo.bar
+ i = string.rfind(domain, ".")
+ tld = domain[i+1:]
+ sld = domain[1:i]
+ if (string.lower(sld) in [
+ "co", "ac",
+ "com", "edu", "org", "net", "gov", "mil", "int",
+ "aero", "biz", "cat", "coop", "info", "jobs", "mobi",
+ "museum", "name", "pro", "travel",
+ ] and
+ len(tld) == 2):
+ # domain like .co.uk
+ return False
+ return True
+
+ def set_ok_domain(self, cookie, request):
+ if self.is_blocked(cookie.domain):
+ debug(" domain %s is in user block-list", cookie.domain)
+ return False
+ if self.is_not_allowed(cookie.domain):
+ debug(" domain %s is not in user allow-list", cookie.domain)
+ return False
+ if not self.set_ok_countrycode_domain(cookie, request):
+ debug(" country-code second level domain %s", cookie.domain)
+ return False
+ if cookie.domain_specified:
+ req_host, erhn = eff_request_host(request)
+ domain = cookie.domain
+ if startswith(domain, "."):
+ undotted_domain = domain[1:]
+ else:
+ undotted_domain = domain
+ embedded_dots = (string.find(undotted_domain, ".") >= 0)
+ if not embedded_dots and domain != ".local":
+ debug(" non-local domain %s contains no embedded dot",
+ domain)
+ return False
+ if cookie.version == 0:
+ if (not endswith(erhn, domain) and
+ (not startswith(erhn, ".") and
+ not endswith("."+erhn, domain))):
+ debug(" effective request-host %s (even with added "
+ "initial dot) does not end end with %s",
+ erhn, domain)
+ return False
+ if (cookie.version > 0 or
+ (self.strict_ns_domain & self.DomainRFC2965Match)):
+ if not domain_match(erhn, domain):
+ debug(" effective request-host %s does not domain-match "
+ "%s", erhn, domain)
+ return False
+ if (cookie.version > 0 or
+ (self.strict_ns_domain & self.DomainStrictNoDots)):
+ host_prefix = req_host[:-len(domain)]
+ if (string.find(host_prefix, ".") >= 0 and
+ not IPV4_RE.search(req_host)):
+ debug(" host prefix %s for domain %s contains a dot",
+ host_prefix, domain)
+ return False
+ return True
+
+ def set_ok_port(self, cookie, request):
+ if cookie.port_specified:
+ req_port = request_port(request)
+ if req_port is None:
+ req_port = "80"
+ else:
+ req_port = str(req_port)
+ for p in string.split(cookie.port, ","):
+ try:
+ int(p)
+ except ValueError:
+ debug(" bad port %s (not numeric)", p)
+ return False
+ if p == req_port:
+ break
+ else:
+ debug(" request port (%s) not found in %s",
+ req_port, cookie.port)
+ return False
+ return True
+
+ def return_ok(self, cookie, request):
+ """
+ If you override return_ok, be sure to call this method. If it returns
+ false, so should your subclass (assuming your subclass wants to be more
+ strict about which cookies to return).
+
+ """
+ # Path has already been checked by path_return_ok, and domain blocking
+ # done by domain_return_ok.
+ debug(" - checking cookie %s", cookie)
+
+ for n in "version", "verifiability", "secure", "expires", "port", "domain":
+ fn_name = "return_ok_"+n
+ fn = getattr(self, fn_name)
+ if not fn(cookie, request):
+ return False
+ return True
+
+ def return_ok_version(self, cookie, request):
+ if cookie.version > 0 and not self.rfc2965:
+ debug(" RFC 2965 cookies are switched off")
+ return False
+ elif cookie.version == 0 and not self.netscape:
+ debug(" Netscape cookies are switched off")
+ return False
+ return True
+
+ def return_ok_verifiability(self, cookie, request):
+ if request.unverifiable and is_third_party(request):
+ if cookie.version > 0 and self.strict_rfc2965_unverifiable:
+ debug(" third-party RFC 2965 cookie during unverifiable "
+ "transaction")
+ return False
+ elif cookie.version == 0 and self.strict_ns_unverifiable:
+ debug(" third-party Netscape cookie during unverifiable "
+ "transaction")
+ return False
+ return True
+
+ def return_ok_secure(self, cookie, request):
+ if cookie.secure and request.get_type() != "https":
+ debug(" secure cookie with non-secure request")
+ return False
+ return True
+
+ def return_ok_expires(self, cookie, request):
+ if cookie.is_expired(self._now):
+ debug(" cookie expired")
+ return False
+ return True
+
+ def return_ok_port(self, cookie, request):
+ if cookie.port:
+ req_port = request_port(request)
+ if req_port is None:
+ req_port = "80"
+ for p in string.split(cookie.port, ","):
+ if p == req_port:
+ break
+ else:
+ debug(" request port %s does not match cookie port %s",
+ req_port, cookie.port)
+ return False
+ return True
+
+ def return_ok_domain(self, cookie, request):
+ req_host, erhn = eff_request_host(request)
+ domain = cookie.domain
+
+ # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
+ if (cookie.version == 0 and
+ (self.strict_ns_domain & self.DomainStrictNonDomain) and
+ not cookie.domain_specified and domain != erhn):
+ debug(" cookie with unspecified domain does not string-compare "
+ "equal to request domain")
+ return False
+
+ if cookie.version > 0 and not domain_match(erhn, domain):
+ debug(" effective request-host name %s does not domain-match "
+ "RFC 2965 cookie domain %s", erhn, domain)
+ return False
+ if cookie.version == 0 and not endswith("."+erhn, domain):
+ debug(" request-host %s does not match Netscape cookie domain "
+ "%s", req_host, domain)
+ return False
+ return True
+
+ def domain_return_ok(self, domain, request):
+ # Liberal check of domain. This is here as an optimization to avoid
+ # having to load lots of MSIE cookie files unless necessary.
+
+ # Munge req_host and erhn to always start with a dot, so as to err on
+ # the side of letting cookies through.
+ dotted_req_host, dotted_erhn = eff_request_host(request)
+ if not startswith(dotted_req_host, "."):
+ dotted_req_host = "."+dotted_req_host
+ if not startswith(dotted_erhn, "."):
+ dotted_erhn = "."+dotted_erhn
+ if not (endswith(dotted_req_host, domain) or
+ endswith(dotted_erhn, domain)):
+ #debug(" request domain %s does not match cookie domain %s",
+ # req_host, domain)
+ return False
+
+ if self.is_blocked(domain):
+ debug(" domain %s is in user block-list", domain)
+ return False
+ if self.is_not_allowed(domain):
+ debug(" domain %s is not in user allow-list", domain)
+ return False
+
+ return True
+
+ def path_return_ok(self, path, request):
+ debug("- checking cookie path=%s", path)
+ req_path = request_path(request)
+ if not startswith(req_path, path):
+ debug(" %s does not path-match %s", req_path, path)
+ return False
+ return True
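+
+# Editorial example, not part of the original mechanize source: a policy that
+# refuses cookies from one domain family but accepts everything else:
+#
+#   policy = DefaultCookiePolicy(blocked_domains=[".doubleclick.net"])
+#   policy.is_blocked("ad.doubleclick.net")   # True
+#   policy.is_blocked("www.example.com")      # False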
+
+
+def vals_sorted_by_key(adict):
+ keys = adict.keys()
+ keys.sort()
+ return map(adict.get, keys)
+
+class MappingIterator:
+ """Iterates over nested mapping, depth-first, in sorted order by key."""
+ def __init__(self, mapping):
+ self._s = [(vals_sorted_by_key(mapping), 0, None)] # LIFO stack
+
+ def __iter__(self): return self
+
+ def next(self):
+ # this is hairy because of lack of generators
+ while 1:
+ try:
+ vals, i, prev_item = self._s.pop()
+ except IndexError:
+ raise StopIteration()
+ if i < len(vals):
+ item = vals[i]
+ i = i + 1
+ self._s.append((vals, i, prev_item))
+ try:
+ item.items
+ except AttributeError:
+ # non-mapping
+ break
+ else:
+ # mapping
+ self._s.append((vals_sorted_by_key(item), 0, item))
+ continue
+ return item
+
+
+# Used as second parameter to dict.get method, to distinguish absent
+# dict key from one with a None value.
+class Absent: pass
+
+class CookieJar:
+ """Collection of HTTP cookies.
+
+ You may not need to know about this class: try mechanize.urlopen().
+
+ The major methods are extract_cookies and add_cookie_header; these are all
+ you are likely to need.
+
+ CookieJar supports the iterator protocol:
+
+ for cookie in cookiejar:
+ # do something with cookie
+
+ Methods:
+
+ add_cookie_header(request)
+ extract_cookies(response, request)
+ make_cookies(response, request)
+ set_cookie_if_ok(cookie, request)
+ set_cookie(cookie)
+ clear_session_cookies()
+ clear_expired_cookies()
+ clear(domain=None, path=None, name=None)
+
+ Public attributes
+
+ policy: CookiePolicy object
+
+ """
+
+ non_word_re = re.compile(r"\W")
+ quote_re = re.compile(r"([\"\\])")
+ strict_domain_re = re.compile(r"\.?[^.]*")
+ domain_re = re.compile(r"[^.]*")
+ dots_re = re.compile(r"^\.+")
+
+ def __init__(self, policy=None):
+ """
+ See CookieJar.__doc__ for argument documentation.
+
+ """
+ if policy is None:
+ policy = DefaultCookiePolicy()
+ self._policy = policy
+
+ self._cookies = {}
+
+ # for __getitem__ iteration in pre-2.2 Pythons
+ self._prev_getitem_index = 0
+
+ def set_policy(self, policy):
+ self._policy = policy
+
+ def _cookies_for_domain(self, domain, request):
+ cookies = []
+ if not self._policy.domain_return_ok(domain, request):
+ return []
+ debug("Checking %s for cookies to return", domain)
+ cookies_by_path = self._cookies[domain]
+ for path in cookies_by_path.keys():
+ if not self._policy.path_return_ok(path, request):
+ continue
+ cookies_by_name = cookies_by_path[path]
+ for cookie in cookies_by_name.values():
+ if not self._policy.return_ok(cookie, request):
+ debug(" not returning cookie")
+ continue
+ debug(" it's a match")
+ cookies.append(cookie)
+ return cookies
+
+ def _cookies_for_request(self, request):
+ """Return a list of cookies to be returned to server."""
+ cookies = []
+ for domain in self._cookies.keys():
+ cookies.extend(self._cookies_for_domain(domain, request))
+ return cookies
+
+ def _cookie_attrs(self, cookies):
+ """Return a list of cookie-attributes to be returned to server.
+
+ like ['foo="bar"; $Path="/"', ...]
+
+ The $Version attribute is also added when appropriate (currently only
+ once per request).
+
+ """
+ # add cookies in order of most specific (ie. longest) path first
+ def decreasing_size(a, b): return cmp(len(b.path), len(a.path))
+ cookies.sort(decreasing_size)
+
+ version_set = False
+
+ attrs = []
+ for cookie in cookies:
+ # set version of Cookie header
+ # XXX
+ # What should it be if multiple matching Set-Cookie headers have
+ # different versions themselves?
+ # Answer: there is no answer; was supposed to be settled by
+ # RFC 2965 errata, but that may never appear...
+ version = cookie.version
+ if not version_set:
+ version_set = True
+ if version > 0:
+ attrs.append("$Version=%s" % version)
+
+ # quote cookie value if necessary
+ # (not for Netscape protocol, which already has any quotes
+ # intact, due to the poorly-specified Netscape Cookie: syntax)
+ if ((cookie.value is not None) and
+ self.non_word_re.search(cookie.value) and version > 0):
+ value = self.quote_re.sub(r"\\\1", cookie.value)
+ else:
+ value = cookie.value
+
+ # add cookie-attributes to be returned in Cookie header
+ if cookie.value is None:
+ attrs.append(cookie.name)
+ else:
+ attrs.append("%s=%s" % (cookie.name, value))
+ if version > 0:
+ if cookie.path_specified:
+ attrs.append('$Path="%s"' % cookie.path)
+ if startswith(cookie.domain, "."):
+ domain = cookie.domain
+ if (not cookie.domain_initial_dot and
+ startswith(domain, ".")):
+ domain = domain[1:]
+ attrs.append('$Domain="%s"' % domain)
+ if cookie.port is not None:
+ p = "$Port"
+ if cookie.port_specified:
+ p = p + ('="%s"' % cookie.port)
+ attrs.append(p)
+
+ return attrs
+
+ def add_cookie_header(self, request):
+ """Add correct Cookie: header to request (urllib2.Request object).
+
+ The Cookie2 header is also added unless policy.hide_cookie2 is true.
+
+ The request object (usually a urllib2.Request instance) must support
+ the methods get_full_url, get_host, get_type, has_header, get_header,
+ header_items and add_unredirected_header, as documented by urllib2, and
+ the port attribute (the port number). Actually,
+ RequestUpgradeProcessor will automatically upgrade your Request object
+ to one with has_header, get_header, header_items and
+ add_unredirected_header, if it lacks those methods, for compatibility
+ with pre-2.4 versions of urllib2.
+
+ """
+ debug("add_cookie_header")
+ self._policy._now = self._now = int(time.time())
+
+ req_host, erhn = eff_request_host(request)
+ strict_non_domain = (
+ self._policy.strict_ns_domain & self._policy.DomainStrictNonDomain)
+
+ cookies = self._cookies_for_request(request)
+
+ attrs = self._cookie_attrs(cookies)
+ if attrs:
+ if not request.has_header("Cookie"):
+ request.add_unredirected_header(
+ "Cookie", string.join(attrs, "; "))
+
+ # if necessary, advertise that we know RFC 2965
+ if self._policy.rfc2965 and not self._policy.hide_cookie2:
+ for cookie in cookies:
+ if cookie.version != 1 and not request.has_header("Cookie2"):
+ request.add_unredirected_header("Cookie2", '$Version="1"')
+ break
+
+ self.clear_expired_cookies()
+
+ def _normalized_cookie_tuples(self, attrs_set):
+ """Return list of tuples containing normalised cookie information.
+
+ attrs_set is the list of lists of key,value pairs extracted from
+ the Set-Cookie or Set-Cookie2 headers.
+
+ Tuples are name, value, standard, rest, where name and value are the
+ cookie name and value, standard is a dictionary containing the standard
+ cookie-attributes (discard, secure, version, expires or max-age,
+ domain, path and port) and rest is a dictionary containing the rest of
+ the cookie-attributes.
+
+ """
+ cookie_tuples = []
+
+ boolean_attrs = "discard", "secure"
+ value_attrs = ("version",
+ "expires", "max-age",
+ "domain", "path", "port",
+ "comment", "commenturl")
+
+ for cookie_attrs in attrs_set:
+ name, value = cookie_attrs[0]
+
+ # Build dictionary of standard cookie-attributes (standard) and
+ # dictionary of other cookie-attributes (rest).
+
+ # Note: expiry time is normalised to seconds since epoch. V0
+ # cookies should have the Expires cookie-attribute, and V1 cookies
+ # should have Max-Age, but since V1 includes RFC 2109 cookies (and
+ # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
+ # accept either (but prefer Max-Age).
+ max_age_set = False
+
+ bad_cookie = False
+
+ standard = {}
+ rest = {}
+ for k, v in cookie_attrs[1:]:
+ lc = string.lower(k)
+ # don't lose case distinction for unknown fields
+ if lc in value_attrs or lc in boolean_attrs:
+ k = lc
+ if k in boolean_attrs and v is None:
+ # boolean cookie-attribute is present, but has no value
+ # (like "discard", rather than "port=80")
+ v = True
+ if standard.has_key(k):
+ # only first value is significant
+ continue
+ if k == "domain":
+ if v is None:
+ debug(" missing value for domain attribute")
+ bad_cookie = True
+ break
+ # RFC 2965 section 3.3.3
+ v = string.lower(v)
+ if k == "expires":
+ if max_age_set:
+ # Prefer max-age to expires (like Mozilla)
+ continue
+ if v is None:
+ debug(" missing or invalid value for expires "
+ "attribute: treating as session cookie")
+ continue
+ if k == "max-age":
+ max_age_set = True
+ try:
+ v = int(v)
+ except ValueError:
+ debug(" missing or invalid (non-numeric) value for "
+ "max-age attribute")
+ bad_cookie = True
+ break
+ # convert RFC 2965 Max-Age to seconds since epoch
+ # XXX Strictly you're supposed to follow RFC 2616
+ # age-calculation rules. Remember that zero Max-Age is a
+ # request to discard (old and new) cookie, though.
+ k = "expires"
+ v = self._now + v
+ if (k in value_attrs) or (k in boolean_attrs):
+ if (v is None and
+ k not in ["port", "comment", "commenturl"]):
+ debug(" missing value for %s attribute" % k)
+ bad_cookie = True
+ break
+ standard[k] = v
+ else:
+ rest[k] = v
+
+ if bad_cookie:
+ continue
+
+ cookie_tuples.append((name, value, standard, rest))
+
+ return cookie_tuples
+
+ def _cookie_from_cookie_tuple(self, tup, request):
+ # standard is dict of standard cookie-attributes, rest is dict of the
+ # rest of them
+ name, value, standard, rest = tup
+
+ domain = standard.get("domain", Absent)
+ path = standard.get("path", Absent)
+ port = standard.get("port", Absent)
+ expires = standard.get("expires", Absent)
+
+ # set the easy defaults
+ version = standard.get("version", None)
+ if version is not None: version = int(version)
+ secure = standard.get("secure", False)
+ # (discard is also set if expires is Absent)
+ discard = standard.get("discard", False)
+ comment = standard.get("comment", None)
+ comment_url = standard.get("commenturl", None)
+
+ # set default path
+ if path is not Absent and path != "":
+ path_specified = True
+ path = escape_path(path)
+ else:
+ path_specified = False
+ path = request_path(request)
+ i = string.rfind(path, "/")
+ if i != -1:
+ if version == 0:
+ # Netscape spec parts company from reality here
+ path = path[:i]
+ else:
+ path = path[:i+1]
+ if len(path) == 0: path = "/"
+
+ # set default domain
+ domain_specified = domain is not Absent
+ # but first we have to remember whether it starts with a dot
+ domain_initial_dot = False
+ if domain_specified:
+ domain_initial_dot = bool(startswith(domain, "."))
+ if domain is Absent:
+ req_host, erhn = eff_request_host(request)
+ domain = erhn
+ elif not startswith(domain, "."):
+ domain = "."+domain
+
+ # set default port
+ port_specified = False
+ if port is not Absent:
+ if port is None:
+ # Port attr present, but has no value: default to request port.
+ # Cookie should then only be sent back on that port.
+ port = request_port(request)
+ else:
+ port_specified = True
+ port = re.sub(r"\s+", "", port)
+ else:
+ # No port attr present. Cookie can be sent back on any port.
+ port = None
+
+ # set default expires and discard
+ if expires is Absent:
+ expires = None
+ discard = True
+ elif expires <= self._now:
+ # Expiry date in past is request to delete cookie. This can't be
+ # in DefaultCookiePolicy, because cookies can't be deleted there.
+ try:
+ self.clear(domain, path, name)
+ except KeyError:
+ pass
+ debug("Expiring cookie, domain='%s', path='%s', name='%s'",
+ domain, path, name)
+ return None
+
+ return Cookie(version,
+ name, value,
+ port, port_specified,
+ domain, domain_specified, domain_initial_dot,
+ path, path_specified,
+ secure,
+ expires,
+ discard,
+ comment,
+ comment_url,
+ rest)
+
+ def _cookies_from_attrs_set(self, attrs_set, request):
+ cookie_tuples = self._normalized_cookie_tuples(attrs_set)
+
+ cookies = []
+ for tup in cookie_tuples:
+ cookie = self._cookie_from_cookie_tuple(tup, request)
+ if cookie: cookies.append(cookie)
+ return cookies
+
+ def _process_rfc2109_cookies(self, cookies):
+ if self._policy.rfc2109_as_netscape is None:
+ rfc2109_as_netscape = not self._policy.rfc2965
+ else:
+ rfc2109_as_netscape = self._policy.rfc2109_as_netscape
+ for cookie in cookies:
+ if cookie.version == 1:
+ cookie.rfc2109 = True
+ if rfc2109_as_netscape:
+ # treat 2109 cookies as Netscape cookies rather than
+ # as RFC2965 cookies
+ cookie.version = 0
+
+ def make_cookies(self, response, request):
+ """Return sequence of Cookie objects extracted from response object.
+
+ See extract_cookies.__doc__ for the interfaces required of the
+ response and request arguments.
+
+ """
+ # get cookie-attributes for RFC 2965 and Netscape protocols
+ headers = response.info()
+ rfc2965_hdrs = getheaders(headers, "Set-Cookie2")
+ ns_hdrs = getheaders(headers, "Set-Cookie")
+
+ rfc2965 = self._policy.rfc2965
+ netscape = self._policy.netscape
+
+ if ((not rfc2965_hdrs and not ns_hdrs) or
+ (not ns_hdrs and not rfc2965) or
+ (not rfc2965_hdrs and not netscape) or
+ (not netscape and not rfc2965)):
+ return [] # no relevant cookie headers: quick exit
+
+ try:
+ cookies = self._cookies_from_attrs_set(
+ split_header_words(rfc2965_hdrs), request)
+ except:
+ reraise_unmasked_exceptions()
+ cookies = []
+
+ if ns_hdrs and netscape:
+ try:
+ # RFC 2109 and Netscape cookies
+ ns_cookies = self._cookies_from_attrs_set(
+ parse_ns_headers(ns_hdrs), request)
+ except:
+ reraise_unmasked_exceptions()
+ ns_cookies = []
+ self._process_rfc2109_cookies(ns_cookies)
+
+ # Look for Netscape cookies (from Set-Cookie headers) that match
+ # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
+ # For each match, keep the RFC 2965 cookie and ignore the Netscape
+ # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are
+ # bundled in with the Netscape cookies for this purpose, which is
+ # reasonable behaviour.
+ if rfc2965:
+ lookup = {}
+ for cookie in cookies:
+ lookup[(cookie.domain, cookie.path, cookie.name)] = None
+
+ def no_matching_rfc2965(ns_cookie, lookup=lookup):
+ key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
+ return not lookup.has_key(key)
+ ns_cookies = filter(no_matching_rfc2965, ns_cookies)
+
+ if ns_cookies:
+ cookies.extend(ns_cookies)
+
+ return cookies
+
+ def set_cookie_if_ok(self, cookie, request):
+ """Set a cookie if policy says it's OK to do so.
+
+ cookie: mechanize.Cookie instance
+ request: see extract_cookies.__doc__ for the required interface
+
+ """
+ self._policy._now = self._now = int(time.time())
+
+ if self._policy.set_ok(cookie, request):
+ self.set_cookie(cookie)
+
+ def set_cookie(self, cookie):
+ """Set a cookie, without checking whether or not it should be set.
+
+ cookie: mechanize.Cookie instance
+ """
+ c = self._cookies
+ if not c.has_key(cookie.domain): c[cookie.domain] = {}
+ c2 = c[cookie.domain]
+ if not c2.has_key(cookie.path): c2[cookie.path] = {}
+ c3 = c2[cookie.path]
+ c3[cookie.name] = cookie
+
+ def extract_cookies(self, response, request):
+ """Extract cookies from response, where allowable given the request.
+
+ Look for allowable Set-Cookie: and Set-Cookie2: headers in the response
+ object passed as argument. Any of these headers that are found are
+ used to update the state of the object (subject to the policy.set_ok
+ method's approval).
+
+ The response object (usually the result of a call to
+ mechanize.urlopen, or similar) should support an info method, which
+ returns a mimetools.Message object (in fact, the 'mimetools.Message
+ object' may be any object that provides a getallmatchingheaders
+ method).
+
+ The request object (usually a urllib2.Request instance) must support
+ the methods get_full_url and get_host, as documented by urllib2, and
+ the port attribute (the port number). The request is used to set
+ default values for cookie-attributes as well as for checking that the
+ cookie is OK to be set.
+
+ """
+ debug("extract_cookies: %s", response.info())
+ self._policy._now = self._now = int(time.time())
+
+ for cookie in self.make_cookies(response, request):
+ if self._policy.set_ok(cookie, request):
+ debug(" setting cookie: %s", cookie)
+ self.set_cookie(cookie)
+
+ def clear(self, domain=None, path=None, name=None):
+ """Clear some cookies.
+
+ Invoking this method without arguments will clear all cookies. If
+ given a single argument, only cookies belonging to that domain will be
+ removed. If given two arguments, cookies belonging to the specified
+ path within that domain are removed. If given three arguments, then
+ the cookie with the specified name, path and domain is removed.
+
+ Raises KeyError if no matching cookie exists.
+
+ """
+ if name is not None:
+ if (domain is None) or (path is None):
+ raise ValueError(
+ "domain and path must be given to remove a cookie by name")
+ del self._cookies[domain][path][name]
+ elif path is not None:
+ if domain is None:
+ raise ValueError(
+ "domain must be given to remove cookies by path")
+ del self._cookies[domain][path]
+ elif domain is not None:
+ del self._cookies[domain]
+ else:
+ self._cookies = {}
+
+ def clear_session_cookies(self):
+ """Discard all session cookies.
+
+ Discards all cookies held by object which had either no Max-Age or
+ Expires cookie-attribute or an explicit Discard cookie-attribute, or
+ which otherwise have ended up with a true discard attribute. For
+ interactive browsers, the end of a session usually corresponds to
+ closing the browser window.
+
+ Note that the save method won't save session cookies anyway, unless you
+ ask otherwise by passing a true ignore_discard argument.
+
+ """
+ for cookie in self:
+ if cookie.discard:
+ self.clear(cookie.domain, cookie.path, cookie.name)
+
+ def clear_expired_cookies(self):
+ """Discard all expired cookies.
+
+ You probably don't need to call this method: expired cookies are never
+ sent back to the server (provided you're using DefaultCookiePolicy),
+ this method is called by CookieJar itself every so often, and the save
+ method won't save expired cookies anyway (unless you ask otherwise by
+ passing a true ignore_expires argument).
+
+ """
+ now = time.time()
+ for cookie in self:
+ if cookie.is_expired(now):
+ self.clear(cookie.domain, cookie.path, cookie.name)
+
+ def __getitem__(self, i):
+ if i == 0:
+ self._getitem_iterator = self.__iter__()
+ elif self._prev_getitem_index != i-1: raise IndexError(
+ "CookieJar.__getitem__ only supports sequential iteration")
+ self._prev_getitem_index = i
+ try:
+ return self._getitem_iterator.next()
+ except StopIteration:
+ raise IndexError()
+
+ def __iter__(self):
+ return MappingIterator(self._cookies)
+
+ def __len__(self):
+ """Return number of contained cookies."""
+ i = 0
+ for cookie in self: i = i + 1
+ return i
+
+ def __repr__(self):
+ r = []
+ for cookie in self: r.append(repr(cookie))
+ return "<%s[%s]>" % (self.__class__, string.join(r, ", "))
+
+ def __str__(self):
+ r = []
+ for cookie in self: r.append(str(cookie))
+ return "<%s[%s]>" % (self.__class__, string.join(r, ", "))
+
+
+class LoadError(Exception): pass
+
+class FileCookieJar(CookieJar):
+ """CookieJar that can be loaded from and saved to a file.
+
+ Additional methods
+
+ save(filename=None, ignore_discard=False, ignore_expires=False)
+ load(filename=None, ignore_discard=False, ignore_expires=False)
+ revert(filename=None, ignore_discard=False, ignore_expires=False)
+
+ Additional public attributes
+
+ filename: filename for loading and saving cookies
+
+ Additional public readable attributes
+
+ delayload: request that cookies are lazily loaded from disk; this is only
+ a hint since this only affects performance, not behaviour (unless the
+ cookies on disk are changing); a CookieJar object may ignore it (in fact,
+ only MSIECookieJar lazily loads cookies at the moment)
+
+ """
+
+ def __init__(self, filename=None, delayload=False, policy=None):
+ """
+ See FileCookieJar.__doc__ for argument documentation.
+
+ Cookies are NOT loaded from the named file until either the load or
+ revert method is called.
+
+ """
+ CookieJar.__init__(self, policy)
+ if filename is not None and not isstringlike(filename):
+ raise ValueError("filename must be string-like")
+ self.filename = filename
+ self.delayload = bool(delayload)
+
+ def save(self, filename=None, ignore_discard=False, ignore_expires=False):
+ """Save cookies to a file.
+
+ filename: name of file in which to save cookies
+ ignore_discard: save even cookies set to be discarded
+ ignore_expires: save even cookies that have expired
+
+ The file is overwritten if it already exists, thus wiping all its
+ cookies. Saved cookies can be restored later using the load or revert
+ methods. If filename is not specified, self.filename is used; if
+ self.filename is None, ValueError is raised.
+
+ """
+ raise NotImplementedError()
+
+ def load(self, filename=None, ignore_discard=False, ignore_expires=False):
+ """Load cookies from a file.
+
+ Old cookies are kept unless overwritten by newly loaded ones.
+
+ Arguments are as for .save().
+
+ If filename is not specified, self.filename is used; if self.filename
+ is None, ValueError is raised. The named file must be in the format
+ understood by the class, or LoadError will be raised. This format will
+ be identical to that written by the save method, unless the load format
+ is not sufficiently well understood (as is the case for MSIECookieJar).
+
+ """
+ if filename is None:
+ if self.filename is not None: filename = self.filename
+ else: raise ValueError(MISSING_FILENAME_TEXT)
+
+ f = open(filename)
+ try:
+ self._really_load(f, filename, ignore_discard, ignore_expires)
+ finally:
+ f.close()
+
+ def revert(self, filename=None,
+ ignore_discard=False, ignore_expires=False):
+ """Clear all cookies and reload cookies from a saved file.
+
+ Raises LoadError (or IOError) if reversion is not successful; the
+ object's state will not be altered if this happens.
+
+ """
+ if filename is None:
+ if self.filename is not None: filename = self.filename
+ else: raise ValueError(MISSING_FILENAME_TEXT)
+
+ old_state = copy.deepcopy(self._cookies)
+ self._cookies = {}
+ try:
+ self.load(filename, ignore_discard, ignore_expires)
+ except (LoadError, IOError):
+ self._cookies = old_state
+ raise
Property changes on: Zope3/trunk/src/mechanize/_clientcookie.py
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
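
Editorial note, not part of the checkin: the CookieJar and DefaultCookiePolicy
classes added above are normally driven through a urllib2-style opener rather
than used directly.  A minimal usage sketch, assuming the top-level names
exported by the mechanize package in this revision (CookieJar,
DefaultCookiePolicy, HTTPCookieProcessor, build_opener) and a purely
hypothetical example URL:

    import mechanize

    # refuse cookies from one tracking domain, accept everything else
    policy = mechanize.DefaultCookiePolicy(
        blocked_domains=[".doubleclick.net"])
    jar = mechanize.CookieJar(policy)

    # the processor calls jar.add_cookie_header() on outgoing requests and
    # jar.extract_cookies() on incoming responses
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(jar))
    response = opener.open("http://www.example.com/")

    for cookie in jar:
        print "%s=%s (domain %s)" % (cookie.name, cookie.value, cookie.domain)
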
Added: Zope3/trunk/src/mechanize/_gzip.py
===================================================================
--- Zope3/trunk/src/mechanize/_gzip.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/_gzip.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -0,0 +1,103 @@
+import urllib2
+from cStringIO import StringIO
+import _util
+
+# GzipConsumer was taken from Fredrik Lundh's effbot.org-0.1-20041009 library
+class GzipConsumer:
+
+ def __init__(self, consumer):
+ self.__consumer = consumer
+ self.__decoder = None
+ self.__data = ""
+
+ def __getattr__(self, key):
+ return getattr(self.__consumer, key)
+
+ def feed(self, data):
+ if self.__decoder is None:
+ # check if we have a full gzip header
+ data = self.__data + data
+ try:
+ i = 10
+ flag = ord(data[3])
+ if flag & 4: # extra
+ x = ord(data[i]) + 256*ord(data[i+1])
+ i = i + 2 + x
+ if flag & 8: # filename
+ while ord(data[i]):
+ i = i + 1
+ i = i + 1
+ if flag & 16: # comment
+ while ord(data[i]):
+ i = i + 1
+ i = i + 1
+ if flag & 2: # crc
+ i = i + 2
+ if len(data) < i:
+ raise IndexError("not enough data")
+ if data[:3] != "\x1f\x8b\x08":
+ raise IOError("invalid gzip data")
+ data = data[i:]
+ except IndexError:
+ self.__data = data
+ return # need more data
+ import zlib
+ self.__data = ""
+ self.__decoder = zlib.decompressobj(-zlib.MAX_WBITS)
+ data = self.__decoder.decompress(data)
+ if data:
+ self.__consumer.feed(data)
+
+ def close(self):
+ if self.__decoder:
+ data = self.__decoder.flush()
+ if data:
+ self.__consumer.feed(data)
+ self.__consumer.close()
+
+
+# --------------------------------------------------------------------
+
+# the rest of this module is John Lee's stupid code, not
+# Fredrik's nice code :-)
+
+class stupid_gzip_consumer:
+ def __init__(self): self.data = []
+ def feed(self, data): self.data.append(data)
+
+class stupid_gzip_wrapper(_util.closeable_response):
+ def __init__(self, response):
+ self._response = response
+
+ c = stupid_gzip_consumer()
+ gzc = GzipConsumer(c)
+ gzc.feed(response.read())
+ self.__data = StringIO("".join(c.data))
+
+ def read(self, size=-1):
+ return self.__data.read(size)
+ def readline(self, size=-1):
+ return self.__data.readline(size)
+ def readlines(self, sizehint=-1):
+ return self.__data.readlines(sizehint)
+
+ def __getattr__(self, name):
+ # delegate unknown methods/attributes
+ return getattr(self._response, name)
+
+class HTTPGzipProcessor(urllib2.BaseHandler):
+ handler_order = 200 # response processing before HTTPEquivProcessor
+
+ def http_request(self, request):
+ request.add_header("Accept-Encoding", "gzip")
+ return request
+
+ def http_response(self, request, response):
+ # post-process response
+ enc_hdrs = response.info().getheaders("Content-encoding")
+ for enc_hdr in enc_hdrs:
+ if ("gzip" in enc_hdr) or ("compress" in enc_hdr):
+ return stupid_gzip_wrapper(response)
+ return response
+
+ https_response = http_response
Property changes on: Zope3/trunk/src/mechanize/_gzip.py
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
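
Editorial note, not part of the checkin: HTTPGzipProcessor above is an
ordinary urllib2 handler/processor, so it can be installed with
urllib2.build_opener.  A minimal sketch (module path as added in this
revision; the URL is hypothetical):

    import urllib2
    from mechanize._gzip import HTTPGzipProcessor

    # the processor adds "Accept-Encoding: gzip" to requests and wraps
    # gzip- or compress-encoded responses so that read() returns
    # decompressed data
    opener = urllib2.build_opener(HTTPGzipProcessor())
    response = opener.open("http://www.example.com/")
    body = response.read()
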
Added: Zope3/trunk/src/mechanize/_headersutil.py
===================================================================
--- Zope3/trunk/src/mechanize/_headersutil.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/_headersutil.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -0,0 +1,225 @@
+"""Utility functions for HTTP header value parsing and construction.
+
+Copyright 1997-1998, Gisle Aas
+Copyright 2002-2006, John J. Lee
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file
+COPYING.txt included with the distribution).
+
+"""
+
+import os, re, string, urlparse
+from types import StringType
+from types import UnicodeType
+STRING_TYPES = StringType, UnicodeType
+
+from _util import startswith, endswith, http2time
+
+def is_html(ct_headers, url, allow_xhtml=False):
+ """
+ ct_headers: Sequence of Content-Type headers
+ url: Response URL
+
+ """
+ if not ct_headers:
+ # guess
+ ext = os.path.splitext(urlparse.urlparse(url)[2])[1]
+ html_exts = [".htm", ".html"]
+ if allow_xhtml:
+ html_exts += [".xhtml"]
+ return ext in html_exts
+ # use first header
+ ct = split_header_words(ct_headers)[0][0][0]
+ html_types = ["text/html"]
+ if allow_xhtml:
+ html_types += [
+ "text/xhtml", "text/xml",
+ "application/xml", "application/xhtml+xml",
+ ]
+ return ct in html_types
+
+def unmatched(match):
+ """Return unmatched part of re.Match object."""
+ start, end = match.span(0)
+ return match.string[:start]+match.string[end:]
+
+token_re = re.compile(r"^\s*([^=\s;,]+)")
+quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
+value_re = re.compile(r"^\s*=\s*([^\s;,]*)")
+escape_re = re.compile(r"\\(.)")
+def split_header_words(header_values):
+ r"""Parse header values into a list of lists containing key,value pairs.
+
+ The function knows how to deal with ",", ";" and "=" as well as quoted
+ values after "=". A list of space separated tokens are parsed as if they
+ were separated by ";".
+
+ If the header_values passed as argument contains multiple values, then they
+ are treated as if they were a single value separated by comma ",".
+
+ This means that this function is useful for parsing header fields that
+ follow this syntax (BNF as given in the HTTP/1.1 specification, but we relax
+ the requirement for tokens).
+
+ headers = #header
+ header = (token | parameter) *( [";"] (token | parameter))
+
+ token = 1*<any CHAR except CTLs or separators>
+ separators = "(" | ")" | "<" | ">" | "@"
+ | "," | ";" | ":" | "\" | <">
+ | "/" | "[" | "]" | "?" | "="
+ | "{" | "}" | SP | HT
+
+ quoted-string = ( <"> *(qdtext | quoted-pair ) <"> )
+ qdtext = <any TEXT except <">>
+ quoted-pair = "\" CHAR
+
+ parameter = attribute "=" value
+ attribute = token
+ value = token | quoted-string
+
+ Each header is represented by a list of key/value pairs. The value for a
+ simple token (not part of a parameter) is None. Syntactically incorrect
+ headers will not necessarily be parsed as you would want.
+
+ This is easier to describe with some examples:
+
+ >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
+ [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
+ >>> split_header_words(['text/html; charset="iso-8859-1"'])
+ [[('text/html', None), ('charset', 'iso-8859-1')]]
+ >>> split_header_words([r'Basic realm="\"foo\bar\""'])
+ [[('Basic', None), ('realm', '"foobar"')]]
+
+ """
+ assert type(header_values) not in STRING_TYPES
+ result = []
+ for text in header_values:
+ orig_text = text
+ pairs = []
+ while text:
+ m = token_re.search(text)
+ if m:
+ text = unmatched(m)
+ name = m.group(1)
+ m = quoted_value_re.search(text)
+ if m: # quoted value
+ text = unmatched(m)
+ value = m.group(1)
+ value = escape_re.sub(r"\1", value)
+ else:
+ m = value_re.search(text)
+ if m: # unquoted value
+ text = unmatched(m)
+ value = m.group(1)
+ value = string.rstrip(value)
+ else:
+ # no value, a lone token
+ value = None
+ pairs.append((name, value))
+ elif startswith(string.lstrip(text), ","):
+ # concatenated headers, as per RFC 2616 section 4.2
+ text = string.lstrip(text)[1:]
+ if pairs: result.append(pairs)
+ pairs = []
+ else:
+ # skip junk
+ non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
+ assert nr_junk_chars > 0, (
+ "split_header_words bug: '%s', '%s', %s" %
+ (orig_text, text, pairs))
+ text = non_junk
+ if pairs: result.append(pairs)
+ return result
+
+join_escape_re = re.compile(r"([\"\\])")
+def join_header_words(lists):
+ """Do the inverse of the conversion done by split_header_words.
+
+ Takes a list of lists of (key, value) pairs and produces a single header
+ value. Attribute values are quoted if needed.
+
+ >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
+ 'text/plain; charset="iso-8859/1"'
+ >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
+ 'text/plain, charset="iso-8859/1"'
+
+ """
+ headers = []
+ for pairs in lists:
+ attr = []
+ for k, v in pairs:
+ if v is not None:
+ if not re.search(r"^\w+$", v):
+ v = join_escape_re.sub(r"\\\1", v) # escape " and \
+ v = '"%s"' % v
+ if k is None: # Netscape cookies may have no name
+ k = v
+ else:
+ k = "%s=%s" % (k, v)
+ attr.append(k)
+ if attr: headers.append(string.join(attr, "; "))
+ return string.join(headers, ", ")
+
+def parse_ns_headers(ns_headers):
+ """Ad-hoc parser for Netscape protocol cookie-attributes.
+
+ The old Netscape cookie format for Set-Cookie can for instance contain
+ an unquoted "," in the expires field, so we have to use this ad-hoc
+ parser instead of split_header_words.
+
+ XXX This may not make the best possible effort to parse all the crap
+ that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient
+ parser is probably better, so we could do worse than following that if
+ this ever gives any trouble.
+
+ Currently, this is also used for parsing RFC 2109 cookies.
+
+ """
+ known_attrs = ("expires", "domain", "path", "secure",
+ # RFC 2109 attrs (may turn up in Netscape cookies, too)
+ "port", "max-age")
+
+ result = []
+ for ns_header in ns_headers:
+ pairs = []
+ version_set = False
+ params = re.split(r";\s*", ns_header)
+ for ii in range(len(params)):
+ param = params[ii]
+ param = string.rstrip(param)
+ if param == "": continue
+ if "=" not in param:
+ k, v = param, None
+ else:
+ k, v = re.split(r"\s*=\s*", param, 1)
+ k = string.lstrip(k)
+ if ii != 0:
+ lc = string.lower(k)
+ if lc in known_attrs:
+ k = lc
+ if k == "version":
+ # This is an RFC 2109 cookie.
+ version_set = True
+ if k == "expires":
+ # convert expires date to seconds since epoch
+ if startswith(v, '"'): v = v[1:]
+ if endswith(v, '"'): v = v[:-1]
+ v = http2time(v) # None if invalid
+ pairs.append((k, v))
+
+ if pairs:
+ if not version_set:
+ pairs.append(("version", "0"))
+ result.append(pairs)
+
+ return result
+
+
+def _test():
+ import doctest, _headersutil
+ return doctest.testmod(_headersutil)
+
+if __name__ == "__main__":
+ _test()
Property changes on: Zope3/trunk/src/mechanize/_headersutil.py
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
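
Editorial note, not part of the checkin: parse_ns_headers exists alongside
split_header_words because Netscape-style Expires values contain an unquoted
comma, which split_header_words cannot handle.  A small sketch (the header
value is hypothetical):

    from mechanize._headersutil import parse_ns_headers

    hdr = 'FOO=bar; path=/; expires=Wednesday, 09-Nov-2006 23:12:40 GMT'
    attrs = parse_ns_headers([hdr])
    # attrs is a list with one list of (key, value) pairs, roughly:
    # [[('FOO', 'bar'), ('path', '/'),
    #   ('expires', <seconds since the epoch>), ('version', '0')]]
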
Added: Zope3/trunk/src/mechanize/_html.py
===================================================================
--- Zope3/trunk/src/mechanize/_html.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/_html.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -0,0 +1,619 @@
+"""HTML handling.
+
+Copyright 2003-2006 John J. Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
+included with the distribution).
+
+"""
+
+import re, copy, urllib, htmlentitydefs
+from urlparse import urljoin
+
+import _request
+from _headersutil import split_header_words, is_html as _is_html
+
+## # XXXX miserable hack
+## def urljoin(base, url):
+## if url.startswith("?"):
+## return base+url
+## else:
+## return urlparse.urljoin(base, url)
+
+## def chr_range(a, b):
+## return "".join(map(chr, range(ord(a), ord(b)+1)))
+
+## RESERVED_URL_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+## "abcdefghijklmnopqrstuvwxyz"
+## "-_.~")
+## UNRESERVED_URL_CHARS = "!*'();:@&=+$,/?%#[]"
+# we want (RESERVED_URL_CHARS+UNRESERVED_URL_CHARS), minus those
+# 'safe'-by-default characters that urllib.urlquote never quotes
+URLQUOTE_SAFE_URL_CHARS = "!*'();:@&=+$,/?%#[]~"
+
+DEFAULT_ENCODING = "latin-1"
+
+class CachingGeneratorFunction(object):
+ """Caching wrapper around a no-arguments iterable.
+
+ >>> i = [1]
+ >>> func = CachingGeneratorFunction(i)
+ >>> list(func())
+ [1]
+ >>> list(func())
+ [1]
+
+ >>> i = [1, 2, 3]
+ >>> func = CachingGeneratorFunction(i)
+ >>> list(func())
+ [1, 2, 3]
+
+ >>> i = func()
+ >>> i.next()
+ 1
+ >>> i.next()
+ 2
+ >>> i.next()
+ 3
+
+ >>> i = func()
+ >>> j = func()
+ >>> i.next()
+ 1
+ >>> j.next()
+ 1
+ >>> i.next()
+ 2
+ >>> j.next()
+ 2
+ >>> j.next()
+ 3
+ >>> i.next()
+ 3
+ >>> i.next()
+ Traceback (most recent call last):
+ ...
+ StopIteration
+ >>> j.next()
+ Traceback (most recent call last):
+ ...
+ StopIteration
+ """
+ def __init__(self, iterable):
+ def make_gen():
+ for item in iterable:
+ yield item
+
+ self._cache = []
+ self._generator = make_gen()
+
+ def __call__(self):
+ cache = self._cache
+
+ for item in cache:
+ yield item
+ for item in self._generator:
+ cache.append(item)
+ yield item
+
+def encoding_finder(default_encoding):
+ def encoding(response):
+ # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
+ # headers may be in the response. HTTP-EQUIV headers come last,
+ # so try in order from first to last.
+ for ct in response.info().getheaders("content-type"):
+ for k, v in split_header_words([ct])[0]:
+ if k == "charset":
+ return v
+ return default_encoding
+ return encoding
+
+def make_is_html(allow_xhtml):
+ def is_html(response, encoding):
+ ct_hdrs = response.info().getheaders("content-type")
+ url = response.geturl()
+ # XXX encoding
+ return _is_html(ct_hdrs, url, allow_xhtml)
+ return is_html
+
+# idea for this argument-processing trick is from Peter Otten
+class Args:
+ def __init__(self, args_map):
+ self.dictionary = dict(args_map)
+ def __getattr__(self, key):
+ try:
+ return self.dictionary[key]
+ except KeyError:
+ return getattr(self.__class__, key)
+
+def form_parser_args(
+ select_default=False,
+ form_parser_class=None,
+ request_class=None,
+ backwards_compat=False,
+ ):
+ return Args(locals())
+
+
+class Link:
+ def __init__(self, base_url, url, text, tag, attrs):
+ assert None not in [url, tag, attrs]
+ self.base_url = base_url
+ self.absolute_url = urljoin(base_url, url)
+ self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
+ def __cmp__(self, other):
+ try:
+ for name in "url", "text", "tag", "attrs":
+ if getattr(self, name) != getattr(other, name):
+ return -1
+ except AttributeError:
+ return -1
+ return 0
+ def __repr__(self):
+ return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
+ self.base_url, self.url, self.text, self.tag, self.attrs)
+
+
+def clean_url(url, encoding):
+ # percent-encode illegal URL characters
+ # Trying to come up with test cases for this gave me a headache, revisit
+ # when we switch to unicode.
+ # Somebody else's comments (lost the attribution):
+## - IE will return you the url in the encoding you send it
+## - Mozilla/Firefox will send you latin-1 if there's no non latin-1
+## characters in your link. It will send you utf-8 however if there are...
+ if type(url) == type(""):
+ url = url.decode(encoding, "replace")
+ url = url.strip()
+ return urllib.quote(url.encode(encoding), URLQUOTE_SAFE_URL_CHARS)
+
+class LinksFactory:
+
+ def __init__(self,
+ link_parser_class=None,
+ link_class=Link,
+ urltags=None,
+ ):
+ import _pullparser
+ if link_parser_class is None:
+ link_parser_class = _pullparser.TolerantPullParser
+ self.link_parser_class = link_parser_class
+ self.link_class = link_class
+ if urltags is None:
+ urltags = {
+ "a": "href",
+ "area": "href",
+ "frame": "src",
+ "iframe": "src",
+ }
+ self.urltags = urltags
+ self._response = None
+ self._encoding = None
+
+ def set_response(self, response, base_url, encoding):
+ self._response = response
+ self._encoding = encoding
+ self._base_url = base_url
+
+ def links(self):
+ """Return an iterator that provides links of the document."""
+ response = self._response
+ encoding = self._encoding
+ base_url = self._base_url
+ p = self.link_parser_class(response, encoding=encoding)
+
+ for token in p.tags(*(self.urltags.keys()+["base"])):
+ if token.data == "base":
+ base_url = dict(token.attrs).get("href")
+ continue
+ if token.type == "endtag":
+ continue
+ attrs = dict(token.attrs)
+ tag = token.data
+ name = attrs.get("name")
+ text = None
+ # XXX use attr_encoding for ref'd doc if that doc does not provide
+ # one by other means
+ #attr_encoding = attrs.get("charset")
+ url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
+ if not url:
+ # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
+ # For our purposes a link is something with a URL, so ignore
+ # this.
+ continue
+
+ url = clean_url(url, encoding)
+ if tag == "a":
+ if token.type != "startendtag":
+ # hmm, this'd break if end tag is missing
+ text = p.get_compressed_text(("endtag", tag))
+ # but this doesn't work for eg. <a href="blah"><b>Andy</b></a>
+ #text = p.get_compressed_text()
+
+ yield Link(base_url, url, text, tag, token.attrs)
+
+class FormsFactory:
+
+ """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
+
+ For constructor argument docs, see ClientForm.ParseResponse
+ argument docs.
+
+ """
+
+ def __init__(self,
+ select_default=False,
+ form_parser_class=None,
+ request_class=None,
+ backwards_compat=False,
+ ):
+ import ClientForm
+ self.select_default = select_default
+ if form_parser_class is None:
+ form_parser_class = ClientForm.FormParser
+ self.form_parser_class = form_parser_class
+ if request_class is None:
+ request_class = _request.Request
+ self.request_class = request_class
+ self.backwards_compat = backwards_compat
+ self._response = None
+ self.encoding = None
+
+ def set_response(self, response, encoding):
+ self._response = response
+ self.encoding = encoding
+
+ def forms(self):
+ import ClientForm
+ encoding = self.encoding
+ return ClientForm.ParseResponse(
+ self._response,
+ select_default=self.select_default,
+ form_parser_class=self.form_parser_class,
+ request_class=self.request_class,
+ backwards_compat=self.backwards_compat,
+ encoding=encoding,
+ )
+
+class TitleFactory:
+ def __init__(self):
+ self._response = self._encoding = None
+
+ def set_response(self, response, encoding):
+ self._response = response
+ self._encoding = encoding
+
+ def title(self):
+ import _pullparser
+ p = _pullparser.TolerantPullParser(
+ self._response, encoding=self._encoding)
+ try:
+ p.get_tag("title")
+ except _pullparser.NoMoreTokensError:
+ return None
+ else:
+ return p.get_text()
+
+
+def unescape(data, entities, encoding):
+ if data is None or "&" not in data:
+ return data
+
+ def replace_entities(match):
+ ent = match.group()
+ if ent[1] == "#":
+ return unescape_charref(ent[2:-1], encoding)
+
+ repl = entities.get(ent[1:-1])
+ if repl is not None:
+ repl = unichr(repl)
+ if type(repl) != type(""):
+ try:
+ repl = repl.encode(encoding)
+ except UnicodeError:
+ repl = ent
+ else:
+ repl = ent
+ return repl
+
+ return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
+
+def unescape_charref(data, encoding):
+ name, base = data, 10
+ if name.startswith("x"):
+ name, base= name[1:], 16
+ uc = unichr(int(name, base))
+ if encoding is None:
+ return uc
+ else:
+ try:
+ repl = uc.encode(encoding)
+ except UnicodeError:
+ repl = "&#%s;" % data
+ return repl
+
+
+try:
+ import BeautifulSoup
+except ImportError:
+ pass
+else:
+ import sgmllib
+ # monkeypatch to fix http://www.python.org/sf/803422 :-(
+ sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
+ class MechanizeBs(BeautifulSoup.BeautifulSoup):
+ _entitydefs = htmlentitydefs.name2codepoint
+ # don't want the magic Microsoft-char workaround
+ PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
+ lambda(x):x.group(1) + ' />'),
+ (re.compile('<!\s+([^<>]*)>'),
+ lambda(x):'<!' + x.group(1) + '>')
+ ]
+
+ def __init__(self, encoding, text=None, avoidParserProblems=True,
+ initialTextIsEverything=True):
+ self._encoding = encoding
+ BeautifulSoup.BeautifulSoup.__init__(
+ self, text, avoidParserProblems, initialTextIsEverything)
+
+ def handle_charref(self, ref):
+ t = unescape("&#%s;"%ref, self._entitydefs, self._encoding)
+ self.handle_data(t)
+ def handle_entityref(self, ref):
+ t = unescape("&%s;"%ref, self._entitydefs, self._encoding)
+ self.handle_data(t)
+ def unescape_attrs(self, attrs):
+ escaped_attrs = []
+ for key, val in attrs:
+ val = unescape(val, self._entitydefs, self._encoding)
+ escaped_attrs.append((key, val))
+ return escaped_attrs
+
+class RobustLinksFactory:
+
+ compress_re = re.compile(r"\s+")
+
+ def __init__(self,
+ link_parser_class=None,
+ link_class=Link,
+ urltags=None,
+ ):
+ import BeautifulSoup
+ if link_parser_class is None:
+ link_parser_class = MechanizeBs
+ self.link_parser_class = link_parser_class
+ self.link_class = link_class
+ if urltags is None:
+ urltags = {
+ "a": "href",
+ "area": "href",
+ "frame": "src",
+ "iframe": "src",
+ }
+ self.urltags = urltags
+ self._bs = None
+ self._encoding = None
+ self._base_url = None
+
+ def set_soup(self, soup, base_url, encoding):
+ self._bs = soup
+ self._base_url = base_url
+ self._encoding = encoding
+
+ def links(self):
+ import BeautifulSoup
+ bs = self._bs
+ base_url = self._base_url
+ encoding = self._encoding
+ gen = bs.recursiveChildGenerator()
+ for ch in gen:
+ if (isinstance(ch, BeautifulSoup.Tag) and
+ ch.name in self.urltags.keys()+["base"]):
+ link = ch
+ attrs = bs.unescape_attrs(link.attrs)
+ attrs_dict = dict(attrs)
+ if link.name == "base":
+ base_url = attrs_dict.get("href")
+ continue
+ url_attr = self.urltags[link.name]
+ url = attrs_dict.get(url_attr)
+ if not url:
+ continue
+ url = clean_url(url, encoding)
+ text = link.firstText(lambda t: True)
+ if text is BeautifulSoup.Null:
+ # follow _pullparser's weird behaviour rigidly
+ if link.name == "a":
+ text = ""
+ else:
+ text = None
+ else:
+ text = self.compress_re.sub(" ", text.strip())
+ yield Link(base_url, url, text, link.name, attrs)
+
+
+class RobustFormsFactory(FormsFactory):
+ def __init__(self, *args, **kwds):
+ import ClientForm
+ args = form_parser_args(*args, **kwds)
+ if args.form_parser_class is None:
+ args.form_parser_class = ClientForm.RobustFormParser
+ FormsFactory.__init__(self, **args.dictionary)
+
+ def set_response(self, response, encoding):
+ self._response = response
+ self.encoding = encoding
+
+
+class RobustTitleFactory:
+ def __init__(self):
+ self._bs = self._encoding = None
+
+ def set_soup(self, soup, encoding):
+ self._bs = soup
+ self._encoding = encoding
+
+ def title(self):
+ import BeautifulSoup
+ title = self._bs.first("title")
+ if title == BeautifulSoup.Null:
+ return None
+ else:
+ return title.firstText(lambda t: True)
+
+
+class Factory:
+ """Factory for forms, links, etc.
+
+ This interface may expand in future.
+
+ Public methods:
+
+ set_request_class(request_class)
+ set_response(response)
+ forms()
+ links()
+
+ Public attributes:
+
+ encoding: string specifying the encoding of response if it contains a text
+ document (this value is left unspecified for documents that do not have
+ an encoding, e.g. an image file)
+ is_html: true if response contains an HTML document (XHTML may be
+ regarded as HTML too)
+ title: page title, or None if no title or not HTML
+
+ """
+
+ def __init__(self, forms_factory, links_factory, title_factory,
+ get_encoding=encoding_finder(DEFAULT_ENCODING),
+ is_html_p=make_is_html(allow_xhtml=False),
+ ):
+ """
+
+ Pass keyword arguments only.
+
+ get_encoding: callable that takes a response and returns the character
+ encoding to use; the default reads the charset from the Content-type
+ header and falls back to latin-1 when none can be determined. You
+ should turn on HTTP-EQUIV handling if you want the best chance of
+ getting this right without resorting to that fallback, whose value
+ may change in future.
+
+ """
+ self._forms_factory = forms_factory
+ self._links_factory = links_factory
+ self._title_factory = title_factory
+ self._get_encoding = get_encoding
+ self._is_html_p = is_html_p
+
+ self.set_response(None)
+
+ def set_request_class(self, request_class):
+ """Set urllib2.Request class.
+
+ ClientForm.HTMLForm instances returned by .forms() will return
+ instances of this class when .click()ed.
+
+ """
+ self._forms_factory.request_class = request_class
+
+ def set_response(self, response):
+ """Set response.
+
+ The response must implement the same interface as objects returned by
+ urllib2.urlopen().
+
+ """
+ self._response = response
+ self._forms_genf = self._links_genf = None
+ self._get_title = None
+ for name in ["encoding", "is_html", "title"]:
+ try:
+ delattr(self, name)
+ except AttributeError:
+ pass
+
+ def __getattr__(self, name):
+ if name not in ["encoding", "is_html", "title"]:
+ return getattr(self.__class__, name)
+
+ try:
+ if name == "encoding":
+ self.encoding = self._get_encoding(self._response)
+ return self.encoding
+ elif name == "is_html":
+ self.is_html = self._is_html_p(self._response, self.encoding)
+ return self.is_html
+ elif name == "title":
+ if self.is_html:
+ self.title = self._title_factory.title()
+ else:
+ self.title = None
+ return self.title
+ finally:
+ self._response.seek(0)
+
+ def forms(self):
+ """Return iterable over ClientForm.HTMLForm-like objects."""
+ if self._forms_genf is None:
+ self._forms_genf = CachingGeneratorFunction(
+ self._forms_factory.forms())
+ return self._forms_genf()
+
+ def links(self):
+ """Return iterable over mechanize.Link-like objects."""
+ if self._links_genf is None:
+ self._links_genf = CachingGeneratorFunction(
+ self._links_factory.links())
+ return self._links_genf()
+
+class DefaultFactory(Factory):
+ """Based on sgmllib."""
+ def __init__(self, i_want_broken_xhtml_support=False):
+ Factory.__init__(
+ self,
+ forms_factory=FormsFactory(),
+ links_factory=LinksFactory(),
+ title_factory=TitleFactory(),
+ is_html_p=make_is_html(allow_xhtml=i_want_broken_xhtml_support),
+ )
+
+ def set_response(self, response):
+ Factory.set_response(self, response)
+ if response is not None:
+ self._forms_factory.set_response(
+ copy.copy(response), self.encoding)
+ self._links_factory.set_response(
+ copy.copy(response), self._response.geturl(), self.encoding)
+ self._title_factory.set_response(
+ copy.copy(response), self.encoding)
+
+class RobustFactory(Factory):
+ """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
+ DefaultFactory.
+
+ """
+ def __init__(self, i_want_broken_xhtml_support=False,
+ soup_class=None):
+ Factory.__init__(
+ self,
+ forms_factory=RobustFormsFactory(),
+ links_factory=RobustLinksFactory(),
+ title_factory=RobustTitleFactory(),
+ is_html_p=make_is_html(allow_xhtml=i_want_broken_xhtml_support),
+ )
+ if soup_class is None:
+ soup_class = MechanizeBs
+ self._soup_class = soup_class
+
+ def set_response(self, response):
+ import BeautifulSoup
+ Factory.set_response(self, response)
+ if response is not None:
+ data = response.read()
+ soup = self._soup_class(self.encoding, data)
+ self._forms_factory.set_response(response, self.encoding)
+ self._links_factory.set_soup(
+ soup, response.geturl(), self.encoding)
+ self._title_factory.set_soup(soup, self.encoding)
Property changes on: Zope3/trunk/src/mechanize/_html.py
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
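A minimal sketch of the Factory interface defined in _html.py above, as the reworked Browser in _mechanize.py (below) drives it; `response` stands for any seekable urllib2-style response (for example one wrapped with response_seek_wrapper from _util), and the import paths are assumptions about this tree's layout:

    from mechanize._html import DefaultFactory
    from mechanize._request import Request

    factory = DefaultFactory()
    factory.set_request_class(Request)      # forms will .click() into this class
    factory.set_response(response)          # response: seekable, urllib2-style
    if factory.is_html:
        title = factory.title               # None if the document has no <title>
        links = list(factory.links())       # cached mechanize Link objects
        forms = list(factory.forms())       # ClientForm.HTMLForm-like objects

Both links() and forms() go through CachingGeneratorFunction, so iterating them a second time replays the cache rather than re-parsing the response.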
Added: Zope3/trunk/src/mechanize/_lwpcookiejar.py
===================================================================
--- Zope3/trunk/src/mechanize/_lwpcookiejar.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/_lwpcookiejar.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -0,0 +1,185 @@
+"""Load / save to libwww-perl (LWP) format files.
+
+Actually, the format is slightly extended from that used by LWP's
+(libwww-perl's) HTTP::Cookies, to avoid losing some RFC 2965 information
+not recorded by LWP.
+
+It uses the version string "2.0", though really there isn't an LWP Cookies
+2.0 format. This indicates that there is extra information in here
+(domain_dot and port_spec) while still being compatible with libwww-perl,
+I hope.
+
+Copyright 2002-2006 John J Lee <jjl at pobox.com>
+Copyright 1997-1999 Gisle Aas (original libwww-perl code)
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file
+COPYING.txt included with the distribution).
+
+"""
+
+import time, re, string, logging
+
+from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \
+ MISSING_FILENAME_TEXT, LoadError
+from _headersutil import join_header_words, split_header_words
+from _util import startswith, iso2time, time2isoz
+
+debug = logging.getLogger("mechanize").debug
+
+
+def lwp_cookie_str(cookie):
+ """Return string representation of Cookie in an the LWP cookie file format.
+
+ Actually, the format is extended a bit -- see module docstring.
+
+ """
+ h = [(cookie.name, cookie.value),
+ ("path", cookie.path),
+ ("domain", cookie.domain)]
+ if cookie.port is not None: h.append(("port", cookie.port))
+ if cookie.path_specified: h.append(("path_spec", None))
+ if cookie.port_specified: h.append(("port_spec", None))
+ if cookie.domain_initial_dot: h.append(("domain_dot", None))
+ if cookie.secure: h.append(("secure", None))
+ if cookie.expires: h.append(("expires",
+ time2isoz(float(cookie.expires))))
+ if cookie.discard: h.append(("discard", None))
+ if cookie.comment: h.append(("comment", cookie.comment))
+ if cookie.comment_url: h.append(("commenturl", cookie.comment_url))
+ if cookie.rfc2109: h.append(("rfc2109", None))
+
+ keys = cookie.nonstandard_attr_keys()
+ keys.sort()
+ for k in keys:
+ h.append((k, str(cookie.get_nonstandard_attr(k))))
+
+ h.append(("version", str(cookie.version)))
+
+ return join_header_words([h])
+
+class LWPCookieJar(FileCookieJar):
+ """
+ The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
+ "Set-Cookie3" is the format used by the libwww-perl library, not known
+ to be compatible with any browser, but which is easy to read and
+ doesn't lose information about RFC 2965 cookies.
+
+ Additional methods
+
+ as_lwp_str(ignore_discard=True, ignore_expires=True)
+
+ """
+
+ magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
+
+ def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
+ """Return cookies as a string of "\n"-separated "Set-Cookie3" headers.
+
+ ignore_discard and ignore_expires: see docstring for FileCookieJar.save
+
+ """
+ now = time.time()
+ r = []
+ for cookie in self:
+ if not ignore_discard and cookie.discard:
+ debug(" Not saving %s: marked for discard", cookie.name)
+ continue
+ if not ignore_expires and cookie.is_expired(now):
+ debug(" Not saving %s: expired", cookie.name)
+ continue
+ r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
+ return string.join(r+[""], "\n")
+
+ def save(self, filename=None, ignore_discard=False, ignore_expires=False):
+ if filename is None:
+ if self.filename is not None: filename = self.filename
+ else: raise ValueError(MISSING_FILENAME_TEXT)
+
+ f = open(filename, "w")
+ try:
+ debug("Saving LWP cookies file")
+ # There really isn't an LWP Cookies 2.0 format, but this indicates
+ # that there is extra information in here (domain_dot and
+ # port_spec) while still being compatible with libwww-perl, I hope.
+ f.write("#LWP-Cookies-2.0\n")
+ f.write(self.as_lwp_str(ignore_discard, ignore_expires))
+ finally:
+ f.close()
+
+ def _really_load(self, f, filename, ignore_discard, ignore_expires):
+ magic = f.readline()
+ if not re.search(self.magic_re, magic):
+ msg = "%s does not seem to contain cookies" % filename
+ raise LoadError(msg)
+
+ now = time.time()
+
+ header = "Set-Cookie3:"
+ boolean_attrs = ("port_spec", "path_spec", "domain_dot",
+ "secure", "discard", "rfc2109")
+ value_attrs = ("version",
+ "port", "path", "domain",
+ "expires",
+ "comment", "commenturl")
+
+ try:
+ while 1:
+ line = f.readline()
+ if line == "": break
+ if not startswith(line, header):
+ continue
+ line = string.strip(line[len(header):])
+
+ for data in split_header_words([line]):
+ name, value = data[0]
+ standard = {}
+ rest = {}
+ for k in boolean_attrs:
+ standard[k] = False
+ for k, v in data[1:]:
+ if k is not None:
+ lc = string.lower(k)
+ else:
+ lc = None
+ # don't lose case distinction for unknown fields
+ if (lc in value_attrs) or (lc in boolean_attrs):
+ k = lc
+ if k in boolean_attrs:
+ if v is None: v = True
+ standard[k] = v
+ elif k in value_attrs:
+ standard[k] = v
+ else:
+ rest[k] = v
+
+ h = standard.get
+ expires = h("expires")
+ discard = h("discard")
+ if expires is not None:
+ expires = iso2time(expires)
+ if expires is None:
+ discard = True
+ domain = h("domain")
+ domain_specified = startswith(domain, ".")
+ c = Cookie(h("version"), name, value,
+ h("port"), h("port_spec"),
+ domain, domain_specified, h("domain_dot"),
+ h("path"), h("path_spec"),
+ h("secure"),
+ expires,
+ discard,
+ h("comment"),
+ h("commenturl"),
+ rest,
+ h("rfc2109"),
+ )
+ if not ignore_discard and c.discard:
+ continue
+ if not ignore_expires and c.is_expired(now):
+ continue
+ self.set_cookie(c)
+ except:
+ reraise_unmasked_exceptions((IOError,))
+ raise LoadError("invalid Set-Cookie3 format file %s" % filename)
+
Property changes on: Zope3/trunk/src/mechanize/_lwpcookiejar.py
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
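A minimal usage sketch for the LWPCookieJar added above; the filename and URL are illustrative, and build_opener / HTTPCookieProcessor are assumed to be exported by the package __init__ (the MSIECookieJar docstring further down uses them the same way):

    import mechanize
    from mechanize._lwpcookiejar import LWPCookieJar

    cj = LWPCookieJar("cookies.lwp")
    # cj.load() would read an existing "#LWP-Cookies-2.0" file back in.
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    opener.open("http://example.com/")
    cj.save(ignore_discard=True, ignore_expires=True)   # keep session cookies too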
Modified: Zope3/trunk/src/mechanize/_mechanize.py
===================================================================
--- Zope3/trunk/src/mechanize/_mechanize.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/_mechanize.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -1,183 +1,91 @@
"""Stateful programmatic WWW navigation, after Perl's WWW::Mechanize.
-Copyright 2003-2005 John J. Lee <jjl at pobox.com>
+Copyright 2003-2006 John J. Lee <jjl at pobox.com>
Copyright 2003 Andy Lester (original Perl code)
-This code is free software; you can redistribute it and/or modify it under
-the terms of the BSD License (see the file COPYING included with the
-distribution).
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
+included with the distribution).
"""
-# XXXX
-# test referer bugs (frags and don't add in redirect unless orig req had Referer)
+import urllib2, urlparse, sys, copy, re
-# XXX
-# The stuff on web page's todo list.
-# Moof's emails about response object, .back(), etc.
-
-from __future__ import generators
-
-import urllib2, urlparse, re, sys
-
-import ClientCookie
-from ClientCookie._Util import response_seek_wrapper
-from ClientCookie._HeadersUtil import split_header_words, is_html
-# serves me right for not using a version tuple...
-VERSION_RE = re.compile(r"(?P<major>\d+)\.(?P<minor>\d+)\.(?P<bugfix>\d+)"
- r"(?P<state>[ab])?(?:-pre)?(?P<pre>\d+)?$")
-def parse_version(text):
- m = VERSION_RE.match(text)
- if m is None:
- raise ValueError
- return tuple([m.groupdict()[part] for part in
- ("major", "minor", "bugfix", "state", "pre")])
-assert map(int, parse_version(ClientCookie.VERSION)[:3]) >= [1, 0, 3], \
- "ClientCookie 1.0.3 or newer is required"
-
from _useragent import UserAgent
+from _html import DefaultFactory
+from _util import response_seek_wrapper, closeable_response
+import _request
-__version__ = (0, 0, 10, "a", None) # 0.0.10a
+__version__ = (0, 1, 2, "b", None) # 0.1.2b
class BrowserStateError(Exception): pass
class LinkNotFoundError(Exception): pass
class FormNotFoundError(Exception): pass
-class Link:
- def __init__(self, base_url, url, text, tag, attrs):
- assert None not in [url, tag, attrs]
- self.base_url = base_url
- self.absolute_url = urlparse.urljoin(base_url, url)
- self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
- def __cmp__(self, other):
- try:
- for name in "url", "text", "tag", "attrs":
- if getattr(self, name) != getattr(other, name):
- return -1
- except AttributeError:
- return -1
- return 0
- def __repr__(self):
- return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
- self.base_url, self.url, self.text, self.tag, self.attrs)
+class History:
+ """
-class LinksFactory:
+ Though this will become public, the implied interface is not yet stable.
- def __init__(self,
- link_parser_class=None,
- link_class=Link,
- urltags=None,
- ):
- import pullparser
- assert pullparser.__version__[:3] >= (0, 0, 4), \
- "pullparser 0.0.4b or newer is required"
- if link_parser_class is None:
- link_parser_class = pullparser.TolerantPullParser
- self.link_parser_class = link_parser_class
- self.link_class = link_class
- if urltags is None:
- urltags = {
- "a": "href",
- "area": "href",
- "frame": "src",
- "iframe": "src",
- }
- self.urltags = urltags
-
- def links(self, fh, base_url, encoding=None):
- """Return an iterator that provides links of the document."""
- import pullparser
- p = self.link_parser_class(fh, encoding=encoding)
-
- for token in p.tags(*(self.urltags.keys()+["base"])):
- if token.data == "base":
- base_url = dict(token.attrs).get("href")
- continue
- if token.type == "endtag":
- continue
- attrs = dict(token.attrs)
- tag = token.data
- name = attrs.get("name")
- text = None
- # XXX need to sort out quoting
- #url = urllib.quote_plus(attrs.get(self.urltags[tag]))
- url = attrs.get(self.urltags[tag])
- if tag == "a":
- if token.type != "startendtag":
- # XXX hmm, this'd break if end tag is missing
- text = p.get_compressed_text(("endtag", tag))
- # but this doesn't work for eg. <a href="blah"><b>Andy</b></a>
- #text = p.get_compressed_text()
- # This is a hack from WWW::Mechanize to get some really basic
- # JavaScript working, which I'm not yet convinced is a good
- # idea.
-## onClick = attrs["onclick"]
-## m = re.search(r"/^window\.open\(\s*'([^']+)'/", onClick)
-## if onClick and m:
-## url = m.group(1)
- if not url:
- # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
- # For our purposes a link is something with a URL, so ignore
- # this.
- continue
-
- yield Link(base_url, url, text, tag, token.attrs)
-
-class FormsFactory:
-
- """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
-
- For constructor argument docs, see ClientForm.ParseResponse
- argument docs.
-
"""
+ def __init__(self):
+ self._history = [] # LIFO
+ def add(self, request, response):
+ self._history.append((request, response))
+ def back(self, n, _response):
+ response = _response # XXX move Browser._response into this class?
+ while n > 0 or response is None:
+ try:
+ request, response = self._history.pop()
+ except IndexError:
+ raise BrowserStateError("already at start of history")
+ n -= 1
+ return request, response
+ def clear(self):
+ del self._history[:]
+ def close(self):
+ for request, response in self._history:
+ response.close()
+ del self._history[:]
- def __init__(self,
- select_default=False,
- form_parser_class=None,
- request_class=None,
- backwards_compat=False,
- ):
- import ClientForm
- assert map(int, parse_version(ClientForm.VERSION)[:3]) >= [0, 2, 1], \
- "ClientForm >= 0.2.1a is required"
- self.select_default = select_default
- if form_parser_class is None:
- form_parser_class = ClientForm.FormParser
- self.form_parser_class = form_parser_class
- if request_class is None:
- request_class = ClientCookie.Request
- self.request_class = request_class
- self.backwards_compat = backwards_compat
+# Horrible, but needed, at least until we fork urllib2. Even then, we may want
+# to preserve urllib2 compatibility.
+def upgrade_response(response):
+ # a urllib2 handler constructed the response, i.e. the response is an
+ # urllib.addinfourl, instead of a _Util.closeable_response as returned
+ # by e.g. mechanize.HTTPHandler
+ try:
+ code = response.code
+ except AttributeError:
+ code = None
+ try:
+ msg = response.msg
+ except AttributeError:
+ msg = None
- def parse_response(self, response):
- import ClientForm
- return ClientForm.ParseResponse(
- response,
- select_default=self.select_default,
- form_parser_class=self.form_parser_class,
- request_class=self.request_class,
- backwards_compat=self.backwards_compat,
- )
+ # may have already-.read() data from .seek() cache
+ data = None
+ get_data = getattr(response, "get_data", None)
+ if get_data:
+ data = get_data()
- def parse_file(self, file_obj, base_url):
- import ClientForm
- return ClientForm.ParseFile(
- file_obj,
- base_url,
- select_default=self.select_default,
- form_parser_class=self.form_parser_class,
- request_class=self.request_class,
- backwards_compat=self.backwards_compat,
- )
+ response = closeable_response(
+ response.fp, response.info(), response.geturl(), code, msg)
+ response = response_seek_wrapper(response)
+ if data:
+ response.set_data(data)
+ return response
+class ResponseUpgradeProcessor(urllib2.BaseHandler):
+ # upgrade responses to be .close()able without becoming unusable
+ handler_order = 0 # before anything else
+ def any_response(self, request, response):
+ if not hasattr(response, 'closeable_response'):
+ response = upgrade_response(response)
+ return response
-if sys.version_info[:2] >= (2, 4):
- from ClientCookie._Opener import OpenerMixin
-else:
- class OpenerMixin: pass
-class Browser(UserAgent, OpenerMixin):
+class Browser(UserAgent):
"""Browser-like class with support for history, forms and links.
BrowserStateError is raised whenever the browser is in the wrong state to
@@ -187,56 +95,57 @@
Public attributes:
- request: last request (ClientCookie.Request or urllib2.Request)
+ request: current request (mechanize.Request or urllib2.Request)
form: currently selected form (see .select_form())
- default_encoding: character encoding used for encoding numeric character
- references when matching link text, if no encoding is found in the reponse
- (you should turn on HTTP-EQUIV handling if you want the best chance of
- getting this right without resorting to this default)
"""
- def __init__(self, default_encoding="latin-1",
- forms_factory=None,
- links_factory=None,
+ handler_classes = UserAgent.handler_classes.copy()
+ handler_classes["_response_upgrade"] = ResponseUpgradeProcessor
+ default_others = copy.copy(UserAgent.default_others)
+ default_others.append("_response_upgrade")
+
+ def __init__(self,
+ factory=None,
+ history=None,
request_class=None,
):
"""
Only named arguments should be passed to this constructor.
- default_encoding: See class docs.
- forms_factory: Object supporting the mechanize.FormsFactory interface.
- links_factory: Object supporting the mechanize.LinksFactory interface.
- request_class: Request class to use. Defaults to ClientCookie.Request
+ factory: object implementing the mechanize.Factory interface.
+ history: object implementing the mechanize.History interface. Note this
+ interface is still experimental and may change in future.
+ request_class: Request class to use. Defaults to mechanize.Request
by default for Pythons older than 2.4, urllib2.Request otherwise.
- Note that the supplied forms_factory's request_class attribute is
- assigned to by this constructor, to ensure only one Request class is
- used.
+ The Factory and History objects passed in are 'owned' by the Browser,
+ so they should not be shared across Browsers. In particular,
+ factory.set_response() should not be called except by the owning
+ Browser itself.
+ Note that the supplied factory's request_class is overridden by this
+ constructor, to ensure only one Request class is used.
+
"""
- self.default_encoding = default_encoding
- self._history = [] # LIFO
+ if history is None:
+ history = History()
+ self._history = history
self.request = self._response = None
self.form = None
- self._forms = None
- self._title = None
- self._links = None
if request_class is None:
if not hasattr(urllib2.Request, "add_unredirected_header"):
- request_class = ClientCookie.Request
+ request_class = _request.Request
else:
- request_class = urllib2.Request # Python 2.4
+ request_class = urllib2.Request # Python >= 2.4
+
+ if factory is None:
+ factory = DefaultFactory()
+ factory.set_request_class(request_class)
+ self._factory = factory
self.request_class = request_class
- if forms_factory is None:
- forms_factory = FormsFactory()
- self._forms_factory = forms_factory
- forms_factory.request_class = request_class
- if links_factory is None:
- links_factory = LinksFactory()
- self._links_factory = links_factory
UserAgent.__init__(self) # do this last to avoid __getattr__ problems
@@ -244,8 +153,9 @@
if self._response is not None:
self._response.close()
UserAgent.close(self)
- del self._history[:]
- self._forms = self._title = self._links = None
+ if self._history is not None:
+ self._history.close()
+ self._history = None
self.request = self._response = None
def open(self, url, data=None):
@@ -268,25 +178,74 @@
url = urlparse.urljoin(self._response.geturl(), url)
if self.request is not None and update_history:
- self._history.append((self.request, self._response))
+ self._history.add(self.request, self._response)
self._response = None
# we want self.request to be assigned even if UserAgent.open fails
self.request = self._request(url, data)
self._previous_scheme = self.request.get_type()
- self._response = UserAgent.open(self, self.request, data)
- if not hasattr(self._response, "seek"):
- self._response = response_seek_wrapper(self._response)
- self._parse_html(self._response)
+ success = True
+ try:
+ response = UserAgent.open(self, self.request, data)
+ except urllib2.HTTPError, error:
+ success = False
+ response = error
+## except (IOError, socket.error, OSError), error:
+## # Yes, urllib2 really does raise all these :-((
+## # See test_urllib2.py for examples of socket.gaierror and OSError,
+## # plus note that FTPHandler raises IOError.
+## # XXX I don't seem to have an example of exactly socket.error being
+## # raised, only socket.gaierror...
+## # I don't want to start fixing these here, though, since this is a
+## # subclass of OpenerDirector, and it would break old code. Even in
+## # Python core, a fix would need some backwards-compat. hack to be
+## # acceptable.
+## raise
+ self.set_response(response)
+ if not success:
+ raise error
+ return copy.copy(self._response)
- return self._response
+ def __str__(self):
+ text = []
+ text.append("<%s " % self.__class__.__name__)
+ if self._response:
+ text.append("visiting %s" % self._response.geturl())
+ else:
+ text.append("(not visiting a URL)")
+ if self.form:
+ text.append("\n selected form:\n %s\n" % str(self.form))
+ text.append(">")
+ return "".join(text)
def response(self):
- """Return last response (as return value of urllib2.urlopen())."""
- # XXX This is currently broken: responses returned by this method
- # all share the same seek position.
- return self._response
+ """Return a copy of the current response.
+ The returned object has the same interface as the object returned by
+ .open() (or urllib2.urlopen()).
+
+ """
+ return copy.copy(self._response)
+
+ def set_response(self, response):
+ """Replace current response with (a copy of) response."""
+ # sanity check, necessary but far from sufficient
+ if not (hasattr(response, "info") and hasattr(response, "geturl") and
+ hasattr(response, "read")):
+ raise ValueError("not a response object")
+
+ self.form = None
+
+ if not hasattr(response, "seek"):
+ response = response_seek_wrapper(response)
+ if not hasattr(response, "closeable_response"):
+ response = upgrade_response(response)
+ else:
+ response = copy.copy(response)
+
+ self._response = response
+ self._factory.set_response(self._response)
+
def geturl(self):
"""Get URL of current document."""
if self._response is None:
@@ -309,45 +268,23 @@
"""
if self._response is not None:
self._response.close()
- while n > 0 or self._response is None:
- try:
- self.request, self._response = self._history.pop()
- except IndexError:
- raise BrowserStateError("already at start of history")
- n -= 1
- self._parse_html(self._response)
- return self._response
+ self.request, response = self._history.back(n, self._response)
+ self.set_response(response)
+ return response
+ def clear_history(self):
+ self._history.clear()
+
def links(self, **kwds):
"""Return iterable over links (mechanize.Link objects)."""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
+ links = self._factory.links()
if kwds:
- return self._find_links(False, **kwds)
- if self._links is None:
- try:
- self._links = list(self.get_links_iter())
- finally:
- self._response.seek(0)
- return self._links
+ return self._filter_links(links, **kwds)
+ else:
+ return links
- def get_links_iter(self):
- """Return an iterator that provides links of the document.
-
- This method is provided in addition to .links() to allow lazy iteration
- over links, while still keeping .links() safe against somebody
- .seek()ing on a response "behind your back". When response objects are
- fixed to have independent seek positions, this method will be
- deprecated in favour of .links().
-
- """
- if not self.viewing_html():
- raise BrowserStateError("not viewing HTML")
- base_url = self._response.geturl()
- self._response.seek(0)
- return self._links_factory.links(
- self._response, base_url, self._encoding(self._response))
-
def forms(self):
"""Return iterable over forms.
@@ -356,23 +293,20 @@
"""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
- if self._forms is None:
- response = self._response
- response.seek(0)
- try:
- self._forms = self._forms_factory.parse_response(response)
- finally:
- response.seek(0)
- return self._forms
+ return self._factory.forms()
def viewing_html(self):
"""Return whether the current response contains HTML data."""
if self._response is None:
raise BrowserStateError("not viewing any document")
- ct_hdrs = self._response.info().getheaders("content-type")
- url = self._response.geturl()
- return is_html(ct_hdrs, url)
+ return self._factory.is_html
+ def encoding(self):
+ """"""
+ if self._response is None:
+ raise BrowserStateError("not viewing any document")
+ return self._factory.encoding
+
def title(self):
"""Return title, or None if there is no title element in the document.
@@ -380,27 +314,18 @@
PullParser.get_text() method of pullparser module.
"""
- import pullparser
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
- if self._title is None:
- p = pullparser.TolerantPullParser(
- self._response, encoding=self._encoding(self._response))
- try:
- p.get_tag("title")
- except pullparser.NoMoreTokensError:
- pass
- else:
- self._title = p.get_text()
- return self._title
+ return self._factory.title
def select_form(self, name=None, predicate=None, nr=None):
"""Select an HTML form for input.
This is a bit like giving a form the "input focus" in a browser.
- If a form is selected, the object supports the HTMLForm interface, so
- you can call methods like .set_value(), .set(), and .click().
+ If a form is selected, the Browser object supports the HTMLForm
+ interface, so you can call methods like .set_value(), .set(), and
+ .click().
At least one of the name, predicate and nr arguments must be supplied.
If no matching form is found, mechanize.FormNotFoundError is raised.
@@ -451,7 +376,7 @@
original_scheme = self.request.get_type()
if scheme not in ["http", "https"]:
return request
- if not origin_request and not self.request.has_header('Referer'):
+ if not origin_request and not self.request.has_header("Referer"):
return request
if (self._handle_referer and
@@ -542,7 +467,8 @@
with opening tags "textified" as per the pullparser docs) must compare
equal to this argument, if supplied
text_regex: link text between tag (as defined above) must match the
- regular expression object passed as this argument, if supplied
+ regular expression object or regular expression string passed as this
+ argument, if supplied
name, name_regex: as for text and text_regex, but matched against the
name HTML attribute of the link tag
url, url_regex: as for text and text_regex, but matched against the
@@ -554,7 +480,10 @@
nr: matches the nth link that matches all other criteria (default 0)
"""
- return self._find_links(True, **kwds)
+ try:
+ return self._filter_links(self._factory.links(), **kwds).next()
+ except StopIteration:
+ raise LinkNotFoundError()
def __getattr__(self, name):
# pass through ClientForm / DOMForm methods and attributes
@@ -568,7 +497,7 @@
#---------------------------------------------------
# Private methods.
- def _find_links(self, single,
+ def _filter_links(self, links,
text=None, text_regex=None,
name=None, name_regex=None,
url=None, url_regex=None,
@@ -582,34 +511,22 @@
found_links = []
orig_nr = nr
- # An optimization, so that if we look for a single link we do not have
- # to necessarily parse the entire file.
- if self._links is None and single:
- all_links = self.get_links_iter()
- else:
- if self._links is None:
- try:
- self._links = list(self.get_links_iter())
- finally:
- self._response.seek(0)
- all_links = self._links
-
- for link in all_links:
+ for link in links:
if url is not None and url != link.url:
continue
- if url_regex is not None and not url_regex.search(link.url):
+ if url_regex is not None and not re.search(url_regex, link.url):
continue
if (text is not None and
(link.text is None or text != link.text)):
continue
if (text_regex is not None and
- (link.text is None or not text_regex.search(link.text))):
+ (link.text is None or not re.search(text_regex, link.text))):
continue
if name is not None and name != dict(link.attrs).get("name"):
continue
if name_regex is not None:
link_name = dict(link.attrs).get("name")
- if link_name is None or not name_regex.search(link_name):
+ if link_name is None or not re.search(name_regex, link_name):
continue
if tag is not None and tag != link.tag:
continue
@@ -618,28 +535,5 @@
if nr:
nr -= 1
continue
- if single:
- return link
- else:
- found_links.append(link)
- nr = orig_nr
- if not found_links:
- raise LinkNotFoundError()
- return found_links
-
- def _encoding(self, response):
- # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
- # headers may be in the response. HTTP-EQUIV headers come last,
- # so try in order from first to last.
- for ct in response.info().getheaders("content-type"):
- for k, v in split_header_words([ct])[0]:
- if k == "charset":
- return v
- return self.default_encoding
-
- def _parse_html(self, response):
- # this is now lazy, so we just reset the various attributes that
- # result from parsing
- self.form = None
- self._title = None
- self._forms = self._links = None
+ yield link
+ nr = orig_nr
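A short sketch of the reworked Browser API above; the URLs and link pattern are illustrative, and Browser is assumed to be exported by the package __init__:

    from mechanize import Browser

    br = Browser()
    br.open("http://example.com/")
    br.open("http://example.com/downloads")
    print br.title()
    # links()/find_link() now accept plain regex strings as well as
    # compiled pattern objects (see _filter_links above).
    link = br.find_link(url_regex="download")
    # response() returns an independent copy, so reading it does not
    # disturb the browser's own copy of the document.
    html = br.response().read()
    br.back()                                # returns to the first page

Note also that open() now installs an urllib2.HTTPError as the current response before re-raising it, so the body of an error page stays reachable through response().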
Added: Zope3/trunk/src/mechanize/_mozillacookiejar.py
===================================================================
--- Zope3/trunk/src/mechanize/_mozillacookiejar.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/_mozillacookiejar.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -0,0 +1,160 @@
+"""Mozilla / Netscape cookie loading / saving.
+
+Copyright 2002-2006 John J Lee <jjl at pobox.com>
+Copyright 1997-1999 Gisle Aas (original libwww-perl code)
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file
+COPYING.txt included with the distribution).
+
+"""
+
+import re, string, time, logging
+
+from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \
+ MISSING_FILENAME_TEXT, LoadError
+from _util import startswith, endswith
+debug = logging.getLogger("mechanize").debug
+
+
+class MozillaCookieJar(FileCookieJar):
+ """
+
+ WARNING: you may want to backup your browser's cookies file if you use
+ this class to save cookies. I *think* it works, but there have been
+ bugs in the past!
+
+ This class differs from CookieJar only in the format it uses to save and
+ load cookies to and from a file. This class uses the Mozilla/Netscape
+ `cookies.txt' format. lynx uses this file format, too.
+
+ Don't expect cookies saved while the browser is running to be noticed by
+ the browser (in fact, Mozilla on unix will overwrite your saved cookies if
+ you change them on disk while it's running; on Windows, you probably can't
+ save at all while the browser is running).
+
+ Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
+ Netscape cookies on saving.
+
+ In particular, the cookie version and port number information is lost,
+ together with information about whether or not Path, Port and Discard were
+ specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
+ domain as set in the HTTP header started with a dot (yes, I'm aware some
+ domains in Netscape files start with a dot and some don't -- trust me, you
+ really don't want to know any more about this).
+
+ Note that though Mozilla and Netscape use the same format, they use
+ slightly different headers. The class saves cookies using the Netscape
+ header by default (Mozilla can cope with that).
+
+ """
+ magic_re = "#( Netscape)? HTTP Cookie File"
+ header = """\
+ # Netscape HTTP Cookie File
+ # http://www.netscape.com/newsref/std/cookie_spec.html
+ # This is a generated file! Do not edit.
+
+"""
+
+ def _really_load(self, f, filename, ignore_discard, ignore_expires):
+ now = time.time()
+
+ magic = f.readline()
+ if not re.search(self.magic_re, magic):
+ f.close()
+ raise LoadError(
+ "%s does not look like a Netscape format cookies file" %
+ filename)
+
+ try:
+ while 1:
+ line = f.readline()
+ if line == "": break
+
+ # last field may be absent, so keep any trailing tab
+ if endswith(line, "\n"): line = line[:-1]
+
+ # skip comments and blank lines XXX what is $ for?
+ if (startswith(string.strip(line), "#") or
+ startswith(string.strip(line), "$") or
+ string.strip(line) == ""):
+ continue
+
+ domain, domain_specified, path, secure, expires, name, value = \
+ string.split(line, "\t")
+ secure = (secure == "TRUE")
+ domain_specified = (domain_specified == "TRUE")
+ if name == "":
+ name = value
+ value = None
+
+ initial_dot = startswith(domain, ".")
+ assert domain_specified == initial_dot
+
+ discard = False
+ if expires == "":
+ expires = None
+ discard = True
+
+ # assume path_specified is false
+ c = Cookie(0, name, value,
+ None, False,
+ domain, domain_specified, initial_dot,
+ path, False,
+ secure,
+ expires,
+ discard,
+ None,
+ None,
+ {})
+ if not ignore_discard and c.discard:
+ continue
+ if not ignore_expires and c.is_expired(now):
+ continue
+ self.set_cookie(c)
+
+ except:
+ reraise_unmasked_exceptions((IOError,))
+ raise LoadError("invalid Netscape format file %s: %s" %
+ (filename, line))
+
+ def save(self, filename=None, ignore_discard=False, ignore_expires=False):
+ if filename is None:
+ if self.filename is not None: filename = self.filename
+ else: raise ValueError(MISSING_FILENAME_TEXT)
+
+ f = open(filename, "w")
+ try:
+ debug("Saving Netscape cookies.txt file")
+ f.write(self.header)
+ now = time.time()
+ for cookie in self:
+ if not ignore_discard and cookie.discard:
+ debug(" Not saving %s: marked for discard", cookie.name)
+ continue
+ if not ignore_expires and cookie.is_expired(now):
+ debug(" Not saving %s: expired", cookie.name)
+ continue
+ if cookie.secure: secure = "TRUE"
+ else: secure = "FALSE"
+ if startswith(cookie.domain, "."): initial_dot = "TRUE"
+ else: initial_dot = "FALSE"
+ if cookie.expires is not None:
+ expires = str(cookie.expires)
+ else:
+ expires = ""
+ if cookie.value is None:
+ # cookies.txt regards 'Set-Cookie: foo' as a cookie
+ # with no name, whereas cookielib regards it as a
+ # cookie with no value.
+ name = ""
+ value = cookie.name
+ else:
+ name = cookie.name
+ value = cookie.value
+ f.write(
+ string.join([cookie.domain, initial_dot, cookie.path,
+ secure, expires, name, value], "\t")+
+ "\n")
+ finally:
+ f.close()
Property changes on: Zope3/trunk/src/mechanize/_mozillacookiejar.py
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
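A minimal sketch of loading a Netscape/Mozilla cookies.txt file with the class above; the path is illustrative and the file must be in the cookies.txt format described in the docstring:

    from mechanize._mozillacookiejar import MozillaCookieJar

    cj = MozillaCookieJar()
    cj.load("cookies.txt", ignore_discard=True, ignore_expires=True)
    for cookie in cj:
        print cookie.domain, cookie.name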
Added: Zope3/trunk/src/mechanize/_msiecookiejar.py
===================================================================
--- Zope3/trunk/src/mechanize/_msiecookiejar.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/_msiecookiejar.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -0,0 +1,388 @@
+"""Microsoft Internet Explorer cookie loading on Windows.
+
+Copyright 2002-2003 Johnny Lee <typo_pl at hotmail.com> (MSIE Perl code)
+Copyright 2002-2006 John J Lee <jjl at pobox.com> (The Python port)
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file
+COPYING.txt included with the distribution).
+
+"""
+
+# XXX names and comments are not great here
+
+import os, re, string, time, struct, logging
+if os.name == "nt":
+ import _winreg
+
+from _clientcookie import FileCookieJar, CookieJar, Cookie, \
+ MISSING_FILENAME_TEXT, LoadError
+from _util import startswith
+
+debug = logging.getLogger("mechanize").debug
+
+
+def regload(path, leaf):
+ key = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, path, 0,
+ _winreg.KEY_ALL_ACCESS)
+ try:
+ value = _winreg.QueryValueEx(key, leaf)[0]
+ except WindowsError:
+ value = None
+ return value
+
+WIN32_EPOCH = 0x019db1ded53e8000L # 1970 Jan 01 00:00:00 in Win32 FILETIME
+
+def epoch_time_offset_from_win32_filetime(filetime):
+ """Convert from win32 filetime to seconds-since-epoch value.
+
+ MSIE stores create and expire times as Win32 FILETIME, which is 64
+ bits of 100 nanosecond intervals since Jan 01 1601.
+
+ mechanize expects time as a 32-bit value expressed in seconds since the
+ epoch (Jan 01 1970).
+
+ """
+ if filetime < WIN32_EPOCH:
+ raise ValueError("filetime (%d) is before epoch (%d)" %
+ (filetime, WIN32_EPOCH))
+
+ return divmod((filetime - WIN32_EPOCH), 10000000L)[0]
+
+def binary_to_char(c): return "%02X" % ord(c)
+def binary_to_str(d): return string.join(map(binary_to_char, list(d)), "")
+
+class MSIEBase:
+ magic_re = re.compile(r"Client UrlCache MMF Ver \d\.\d.*")
+ padding = "\x0d\xf0\xad\x0b"
+
+ msie_domain_re = re.compile(r"^([^/]+)(/.*)$")
+ cookie_re = re.compile("Cookie\:.+\@([\x21-\xFF]+).*?"
+ "(.+\@[\x21-\xFF]+\.txt)")
+
+ # path under HKEY_CURRENT_USER from which to get location of index.dat
+ reg_path = r"software\microsoft\windows" \
+ r"\currentversion\explorer\shell folders"
+ reg_key = "Cookies"
+
+ def __init__(self):
+ self._delayload_domains = {}
+
+ def _delayload_domain(self, domain):
+ # if necessary, lazily load cookies for this domain
+ delayload_info = self._delayload_domains.get(domain)
+ if delayload_info is not None:
+ cookie_file, ignore_discard, ignore_expires = delayload_info
+ try:
+ self.load_cookie_data(cookie_file,
+ ignore_discard, ignore_expires)
+ except (LoadError, IOError):
+ debug("error reading cookie file, skipping: %s", cookie_file)
+ else:
+ del self._delayload_domains[domain]
+
+ def _load_cookies_from_file(self, filename):
+ debug("Loading MSIE cookies file: %s", filename)
+ cookies = []
+
+ cookies_fh = open(filename)
+
+ try:
+ while 1:
+ key = cookies_fh.readline()
+ if key == "": break
+
+ rl = cookies_fh.readline
+ def getlong(rl=rl): return long(rl().rstrip())
+ def getstr(rl=rl): return rl().rstrip()
+
+ key = key.rstrip()
+ value = getstr()
+ domain_path = getstr()
+ flags = getlong() # 0x2000 bit is for secure I think
+ lo_expire = getlong()
+ hi_expire = getlong()
+ lo_create = getlong()
+ hi_create = getlong()
+ sep = getstr()
+
+ if "" in (key, value, domain_path, flags, hi_expire, lo_expire,
+ hi_create, lo_create, sep) or (sep != "*"):
+ break
+
+ m = self.msie_domain_re.search(domain_path)
+ if m:
+ domain = m.group(1)
+ path = m.group(2)
+
+ cookies.append({"KEY": key, "VALUE": value, "DOMAIN": domain,
+ "PATH": path, "FLAGS": flags, "HIXP": hi_expire,
+ "LOXP": lo_expire, "HICREATE": hi_create,
+ "LOCREATE": lo_create})
+ finally:
+ cookies_fh.close()
+
+ return cookies
+
+ def load_cookie_data(self, filename,
+ ignore_discard=False, ignore_expires=False):
+ """Load cookies from file containing actual cookie data.
+
+ Old cookies are kept unless overwritten by newly loaded ones.
+
+ You should not call this method if the delayload attribute is set.
+
+ I think each of these files contains all cookies for one user, domain,
+ and path.
+
+ filename: file containing cookies -- usually found in a file like
+ C:\WINNT\Profiles\joe\Cookies\joe at blah[1].txt
+
+ """
+ now = int(time.time())
+
+ cookie_data = self._load_cookies_from_file(filename)
+
+ for cookie in cookie_data:
+ flags = cookie["FLAGS"]
+ secure = ((flags & 0x2000) != 0)
+ filetime = (cookie["HIXP"] << 32) + cookie["LOXP"]
+ expires = epoch_time_offset_from_win32_filetime(filetime)
+ if expires < now:
+ discard = True
+ else:
+ discard = False
+ domain = cookie["DOMAIN"]
+ initial_dot = startswith(domain, ".")
+ if initial_dot:
+ domain_specified = True
+ else:
+ # MSIE 5 does not record whether the domain cookie-attribute
+ # was specified.
+ # Assuming it wasn't is conservative, because with strict
+ # domain matching this will match less frequently; with regular
+ # Netscape tail-matching, this will match at exactly the same
+ # times that domain_specified = True would. It also means we
+ # don't have to prepend a dot to achieve consistency with our
+ # own & Mozilla's domain-munging scheme.
+ domain_specified = False
+
+ # assume path_specified is false
+ # XXX is there other stuff in here? -- eg. comment, commentURL?
+ c = Cookie(0,
+ cookie["KEY"], cookie["VALUE"],
+ None, False,
+ domain, domain_specified, initial_dot,
+ cookie["PATH"], False,
+ secure,
+ expires,
+ discard,
+ None,
+ None,
+ {"flags": flags})
+ if not ignore_discard and c.discard:
+ continue
+ if not ignore_expires and c.is_expired(now):
+ continue
+ CookieJar.set_cookie(self, c)
+
+ def load_from_registry(self, ignore_discard=False, ignore_expires=False,
+ username=None):
+ """
+ username: only required on win9x
+
+ """
+ cookies_dir = regload(self.reg_path, self.reg_key)
+ filename = os.path.normpath(os.path.join(cookies_dir, "INDEX.DAT"))
+ self.load(filename, ignore_discard, ignore_expires, username)
+
+ def _really_load(self, index, filename, ignore_discard, ignore_expires,
+ username):
+ now = int(time.time())
+
+ if username is None:
+ username = string.lower(os.environ['USERNAME'])
+
+ cookie_dir = os.path.dirname(filename)
+
+ data = index.read(256)
+ if len(data) != 256:
+ raise LoadError("%s file is too short" % filename)
+
+ # Cookies' index.dat file starts with 32 bytes of signature
+ # followed by an offset to the first record, stored as a little-
+ # endian DWORD.
+ sig, size, data = data[:32], data[32:36], data[36:]
+ size = struct.unpack("<L", size)[0]
+
+ # check that sig is valid
+ if not self.magic_re.match(sig) or size != 0x4000:
+ raise LoadError("%s ['%s' %s] does not seem to contain cookies" %
+ (str(filename), sig, size))
+
+ # skip to start of first record
+ index.seek(size, 0)
+
+ sector = 128 # size of sector in bytes
+
+ while 1:
+ data = ""
+
+ # Cookies are usually in two contiguous sectors, so read in two
+ # sectors and adjust if not a Cookie.
+ to_read = 2 * sector
+ d = index.read(to_read)
+ if len(d) != to_read:
+ break
+ data = data + d
+
+ # Each record starts with a 4-byte signature and a count
+ # (little-endian DWORD) of sectors for the record.
+ sig, size, data = data[:4], data[4:8], data[8:]
+ size = struct.unpack("<L", size)[0]
+
+ to_read = (size - 2) * sector
+
+## from urllib import quote
+## print "data", quote(data)
+## print "sig", quote(sig)
+## print "size in sectors", size
+## print "size in bytes", size*sector
+## print "size in units of 16 bytes", (size*sector) / 16
+## print "size to read in bytes", to_read
+## print
+
+ if sig != "URL ":
+ assert sig in ("HASH", "LEAK",
+ self.padding, "\x00\x00\x00\x00"), (
+ "unrecognized MSIE index.dat record: %s" %
+ binary_to_str(sig))
+ if sig == "\x00\x00\x00\x00":
+ # assume we've got all the cookies, and stop
+ break
+ if sig == self.padding:
+ continue
+ # skip the rest of this record
+ assert to_read >= 0
+ if size != 2:
+ assert to_read != 0
+ index.seek(to_read, 1)
+ continue
+
+ # read in rest of record if necessary
+ if size > 2:
+ more_data = index.read(to_read)
+ if len(more_data) != to_read: break
+ data = data + more_data
+
+ cookie_re = ("Cookie\:%s\@([\x21-\xFF]+).*?" % username +
+ "(%s\@[\x21-\xFF]+\.txt)" % username)
+ m = re.search(cookie_re, data, re.I)
+ if m:
+ cookie_file = os.path.join(cookie_dir, m.group(2))
+ if not self.delayload:
+ try:
+ self.load_cookie_data(cookie_file,
+ ignore_discard, ignore_expires)
+ except (LoadError, IOError):
+ debug("error reading cookie file, skipping: %s",
+ cookie_file)
+ else:
+ domain = m.group(1)
+ i = domain.find("/")
+ if i != -1:
+ domain = domain[:i]
+
+ self._delayload_domains[domain] = (
+ cookie_file, ignore_discard, ignore_expires)
+
+
+class MSIECookieJar(MSIEBase, FileCookieJar):
+ """FileCookieJar that reads from the Windows MSIE cookies database.
+
+ MSIECookieJar can read the cookie files of Microsoft Internet Explorer
+ (MSIE) for Windows version 5 on Windows NT and version 6 on Windows XP and
+ Windows 98. Other configurations may also work, but are untested. Saving
+ cookies in MSIE format is NOT supported. If you save cookies, they'll be
+ in the usual Set-Cookie3 format, which you can read back in using an
+ instance of the plain old CookieJar class. Don't save using the same
+ filename that you loaded cookies from, because you may succeed in
+ clobbering your MSIE cookies index file!
+
+ You should be able to have mechanize share Internet Explorer's cookies like
+ this (note you need to supply a username to load_from_registry if you're on
+ Windows 9x or Windows ME):
+
+ cj = MSIECookieJar(delayload=1)
+ # find cookies index file in registry and load cookies from it
+ cj.load_from_registry()
+ opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
+ response = opener.open("http://example.com/")
+
+ Iterating over a delayloaded MSIECookieJar instance will not cause any
+ cookies to be read from disk. To force reading of all cookies from disk,
+ call read_all_cookies. Note that the following methods iterate over self:
+ clear_temporary_cookies, clear_expired_cookies, __len__, __repr__, __str__
+ and as_string.
+
+ Additional methods:
+
+ load_from_registry(ignore_discard=False, ignore_expires=False,
+ username=None)
+ load_cookie_data(filename, ignore_discard=False, ignore_expires=False)
+ read_all_cookies()
+
+ """
+ def __init__(self, filename=None, delayload=False, policy=None):
+ MSIEBase.__init__(self)
+ FileCookieJar.__init__(self, filename, delayload, policy)
+
+ def set_cookie(self, cookie):
+ if self.delayload:
+ self._delayload_domain(cookie.domain)
+ CookieJar.set_cookie(self, cookie)
+
+ def _cookies_for_request(self, request):
+ """Return a list of cookies to be returned to server."""
+ domains = self._cookies.copy()
+ domains.update(self._delayload_domains)
+ domains = domains.keys()
+
+ cookies = []
+ for domain in domains:
+ cookies.extend(self._cookies_for_domain(domain, request))
+ return cookies
+
+ def _cookies_for_domain(self, domain, request):
+ if not self._policy.domain_return_ok(domain, request):
+ return []
+ debug("Checking %s for cookies to return", domain)
+ if self.delayload:
+ self._delayload_domain(domain)
+ return CookieJar._cookies_for_domain(self, domain, request)
+
+ def read_all_cookies(self):
+ """Eagerly read in all cookies."""
+ if self.delayload:
+ for domain in self._delayload_domains.keys():
+ self._delayload_domain(domain)
+
+ def load(self, filename, ignore_discard=False, ignore_expires=False,
+ username=None):
+ """Load cookies from an MSIE 'index.dat' cookies index file.
+
+ filename: full path to cookie index file
+ username: only required on win9x
+
+ """
+ if filename is None:
+ if self.filename is not None: filename = self.filename
+ else: raise ValueError(MISSING_FILENAME_TEXT)
+
+ index = open(filename, "rb")
+
+ try:
+ self._really_load(index, filename, ignore_discard, ignore_expires,
+ username)
+ finally:
+ index.close()
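+
+# Illustrative sketch (assumes a Windows machine with an MSIE cookie store):
+# the delayload behaviour described in the class docstring means nothing is
+# read from disk until a domain is needed, unless read_all_cookies() is used.
+#
+#     cj = MSIECookieJar(delayload=True)
+#     cj.load_from_registry()   # locate index.dat via the registry
+#     cj.read_all_cookies()     # force all cookie files to be read now
+#     for cookie in cj:         # safe: everything is already in memory
+#         print cookie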
Property changes on: Zope3/trunk/src/mechanize/_msiecookiejar.py
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Added: Zope3/trunk/src/mechanize/_opener.py
===================================================================
--- Zope3/trunk/src/mechanize/_opener.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/_opener.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -0,0 +1,273 @@
+"""Integration with Python standard library module urllib2: OpenerDirector
+class.
+
+Copyright 2004-2006 John J Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file
+COPYING.txt included with the distribution).
+
+"""
+
+import os, tempfile, urllib2, string, bisect, urlparse
+from urllib import url2pathname
+
+from _util import startswith, isstringlike
+from _request import Request
+
+try:
+ set
+except NameError:
+ import sets
+ set = sets.Set
+
+def methnames(obj):
+ """Return method names of class instance.
+
+ dir(obj) doesn't work across Python versions, this does.
+
+ """
+ return methnames_of_instance_as_dict(obj).keys()
+
+def methnames_of_instance_as_dict(inst):
+ names = {}
+ names.update(methnames_of_class_as_dict(inst.__class__))
+ for methname in dir(inst):
+ try:
+ candidate = getattr(inst, methname)
+ except AttributeError:
+ continue
+ if callable(candidate):
+ names[methname] = None
+ return names
+
+def methnames_of_class_as_dict(klass):
+ names = {}
+ for methname in dir(klass):
+ try:
+ candidate = getattr(klass, methname)
+ except AttributeError:
+ continue
+ if callable(candidate):
+ names[methname] = None
+ for baseclass in klass.__bases__:
+ names.update(methnames_of_class_as_dict(baseclass))
+ return names
+
+
+class OpenerDirector(urllib2.OpenerDirector):
+ def __init__(self):
+ urllib2.OpenerDirector.__init__(self)
+ # really none of these are (sanely) public -- the lack of initial
+ # underscore on some is just due to following urllib2
+ self.process_response = {}
+ self.process_request = {}
+ self._any_request = {}
+ self._any_response = {}
+ self._handler_index_valid = True
+
+ def add_handler(self, handler):
+ if handler in self.handlers:
+ return
+ # XXX why does self.handlers need to be sorted?
+ bisect.insort(self.handlers, handler)
+ handler.add_parent(self)
+ self._handler_index_valid = False
+
+ def _maybe_reindex_handlers(self):
+ if self._handler_index_valid:
+ return
+
+ handle_error = {}
+ handle_open = {}
+ process_request = {}
+ process_response = {}
+ any_request = set()
+ any_response = set()
+ unwanted = []
+
+ for handler in self.handlers:
+ added = False
+ for meth in methnames(handler):
+ if meth in ["redirect_request", "do_open", "proxy_open"]:
+ # oops, coincidental match
+ continue
+
+ if meth == "any_request":
+ any_request.add(handler)
+ added = True
+ continue
+ elif meth == "any_response":
+ any_response.add(handler)
+ added = True
+ continue
+
+ ii = meth.find("_")
+ scheme = meth[:ii]
+ condition = meth[ii+1:]
+
+ if startswith(condition, "error"):
+ jj = string.find(meth[ii+1:], "_") + ii + 1
+ kind = meth[jj+1:]
+ try:
+ kind = int(kind)
+ except ValueError:
+ pass
+ lookup = handle_error.setdefault(scheme, {})
+ elif condition == "open":
+ kind = scheme
+ lookup = handle_open
+ elif condition == "request":
+ kind = scheme
+ lookup = process_request
+ elif condition == "response":
+ kind = scheme
+ lookup = process_response
+ else:
+ continue
+
+ lookup.setdefault(kind, set()).add(handler)
+ added = True
+
+ if not added:
+ unwanted.append(handler)
+
+ for handler in unwanted:
+ self.handlers.remove(handler)
+
+ # sort indexed methods
+ # XXX could be cleaned up
+ for lookup in [process_request, process_response]:
+ for scheme, handlers in lookup.iteritems():
+ lookup[scheme] = handlers
+ for scheme, lookup in handle_error.iteritems():
+ for code, handlers in lookup.iteritems():
+ handlers = list(handlers)
+ handlers.sort()
+ lookup[code] = handlers
+ for scheme, handlers in handle_open.iteritems():
+ handlers = list(handlers)
+ handlers.sort()
+ handle_open[scheme] = handlers
+
+ # cache the indexes
+ self.handle_error = handle_error
+ self.handle_open = handle_open
+ self.process_request = process_request
+ self.process_response = process_response
+ self._any_request = any_request
+ self._any_response = any_response
+
+ def _request(self, url_or_req, data):
+ if isstringlike(url_or_req):
+ req = Request(url_or_req, data)
+ else:
+ # already a urllib2.Request or mechanize.Request instance
+ req = url_or_req
+ if data is not None:
+ req.add_data(data)
+ return req
+
+ def open(self, fullurl, data=None):
+ req = self._request(fullurl, data)
+ req_scheme = req.get_type()
+
+ self._maybe_reindex_handlers()
+
+ # pre-process request
+ # XXX should we allow a Processor to change the URL scheme
+ # of the request?
+ request_processors = set(self.process_request.get(req_scheme, []))
+ request_processors.update(self._any_request)
+ request_processors = list(request_processors)
+ request_processors.sort()
+ for processor in request_processors:
+ for meth_name in ["any_request", req_scheme+"_request"]:
+ meth = getattr(processor, meth_name, None)
+ if meth:
+ req = meth(req)
+
+ # In Python >= 2.4, .open() supports processors already, so we must
+ # call ._open() instead.
+ urlopen = getattr(urllib2.OpenerDirector, "_open",
+ urllib2.OpenerDirector.open)
+ response = urlopen(self, req, data)
+
+ # post-process response
+ response_processors = set(self.process_response.get(req_scheme, []))
+ response_processors.update(self._any_response)
+ response_processors = list(response_processors)
+ response_processors.sort()
+ for processor in response_processors:
+ for meth_name in ["any_response", req_scheme+"_response"]:
+ meth = getattr(processor, meth_name, None)
+ if meth:
+ response = meth(req, response)
+
+ return response
+
+ def error(self, proto, *args):
+ if proto in ['http', 'https']:
+ # XXX http[s] protocols are special-cased
+ dict = self.handle_error['http'] # https is not different than http
+ proto = args[2] # YUCK!
+ meth_name = 'http_error_%s' % proto
+ http_err = 1
+ orig_args = args
+ else:
+ dict = self.handle_error
+ meth_name = proto + '_error'
+ http_err = 0
+ args = (dict, proto, meth_name) + args
+ result = apply(self._call_chain, args)
+ if result:
+ return result
+
+ if http_err:
+ args = (dict, 'default', 'http_error_default') + orig_args
+ return apply(self._call_chain, args)
+
+ def retrieve(self, fullurl, filename=None, reporthook=None, data=None):
+ """Returns (filename, headers).
+
+ For remote objects, the default filename will refer to a temporary
+ file.
+
+ """
+ req = self._request(fullurl, data)
+ type_ = req.get_type()
+ fp = self.open(req)
+ headers = fp.info()
+ if filename is None and type_ == 'file':
+ return url2pathname(req.get_selector()), headers
+ if filename:
+ tfp = open(filename, 'wb')
+ else:
+ path = urlparse.urlparse(fullurl)[2]
+ suffix = os.path.splitext(path)[1]
+ tfp = tempfile.TemporaryFile("wb", suffix=suffix)
+ result = filename, headers
+ bs = 1024*8
+ size = -1
+ read = 0
+ blocknum = 1
+ if reporthook:
+ if headers.has_key("content-length"):
+ size = int(headers["Content-Length"])
+ reporthook(0, bs, size)
+ while 1:
+ block = fp.read(bs)
+ read += len(block)
+ if reporthook:
+ reporthook(blocknum, bs, size)
+ blocknum = blocknum + 1
+ if not block:
+ break
+ tfp.write(block)
+ fp.close()
+ tfp.close()
+ del fp
+ del tfp
+ if size>=0 and read<size:
+ raise IOError("incomplete retrieval error",
+ "got only %d bytes out of %d" % (read,size))
+ return result
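+
+# Illustrative sketch (the LoggingProcessor class is hypothetical, not part of
+# mechanize): _maybe_reindex_handlers() looks only at method names, so a
+# handler defining "any_response" (or "<scheme>_request"/"<scheme>_response")
+# is picked up automatically once added to the opener:
+#
+#     class LoggingProcessor(urllib2.BaseHandler):
+#         def any_response(self, request, response):
+#             print response.geturl(), response.code
+#             return response
+#
+#     import mechanize
+#     opener = mechanize.build_opener(LoggingProcessor)
+#     response = opener.open("http://example.com/")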
Property changes on: Zope3/trunk/src/mechanize/_opener.py
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Added: Zope3/trunk/src/mechanize/_pullparser.py
===================================================================
--- Zope3/trunk/src/mechanize/_pullparser.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/_pullparser.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -0,0 +1,334 @@
+"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
+
+Examples
+
+This program extracts all links from a document. It will print one
+line for each link, containing the URL and the textual description
+between the <A>...</A> tags:
+
+import pullparser, sys
+f = file(sys.argv[1])
+p = pullparser.PullParser(f)
+for token in p.tags("a"):
+ if token.type == "endtag": continue
+ url = dict(token.attrs).get("href", "-")
+ text = p.get_compressed_text(endat=("endtag", "a"))
+ print "%s\t%s" % (url, text)
+
+This program extracts the <TITLE> from the document:
+
+import pullparser, sys
+f = file(sys.argv[1])
+p = pullparser.PullParser(f)
+if p.get_tag("title"):
+ title = p.get_compressed_text()
+ print "Title: %s" % title
+
+
+Copyright 2003-2006 John J. Lee <jjl at pobox.com>
+Copyright 1998-2001 Gisle Aas (original libwww-perl code)
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses.
+
+"""
+
+import re, htmlentitydefs
+import sgmllib, HTMLParser
+
+from _html import unescape, unescape_charref
+
+
+class NoMoreTokensError(Exception): pass
+
+class Token:
+ """Represents an HTML tag, declaration, processing instruction etc.
+
+ Behaves as a tuple-like object (i.e. iterable) and also has attributes
+ .type, .data and .attrs.
+
+ >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
+ >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
+ True
+ >>> (t.type, t.data) == ("starttag", "a")
+ True
+ >>> t.attrs == [("href", "http://www.python.org/")]
+ True
+
+ Public attributes
+
+ type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
+ "data", "comment", "decl", "pi", after the corresponding methods of
+ HTMLParser.HTMLParser
+ data: For a tag, the tag name; otherwise, the relevant data carried by the
+ tag, as a string
+ attrs: list of (name, value) pairs representing HTML attributes
+ (or None if token does not represent an opening tag)
+
+ """
+ def __init__(self, type, data, attrs=None):
+ self.type = type
+ self.data = data
+ self.attrs = attrs
+ def __iter__(self):
+ return iter((self.type, self.data, self.attrs))
+ def __eq__(self, other):
+ type, data, attrs = other
+ if (self.type == type and
+ self.data == data and
+ self.attrs == attrs):
+ return True
+ else:
+ return False
+ def __ne__(self, other): return not self.__eq__(other)
+ def __repr__(self):
+ args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
+ return self.__class__.__name__+"(%s)" % args
+
+def iter_until_exception(fn, exception, *args, **kwds):
+ while 1:
+ try:
+ yield fn(*args, **kwds)
+ except exception:
+ raise StopIteration
+
+
+class _AbstractParser:
+ chunk = 1024
+ compress_re = re.compile(r"\s+")
+ def __init__(self, fh, textify={"img": "alt", "applet": "alt"},
+ encoding="ascii", entitydefs=None):
+ """
+ fh: file-like object (only a .read() method is required) from which to
+ read HTML to be parsed
+ textify: mapping used by .get_text() and .get_compressed_text() methods
+ to represent opening tags as text
+ encoding: encoding used to encode numeric character references by
+ .get_text() and .get_compressed_text() ("ascii" by default)
+
+ entitydefs: mapping like {"amp": "&", ...} containing HTML entity
+ definitions (a sensible default is used). This is used to unescape
+ entities in .get_text() (and .get_compressed_text()) and attribute
+ values. If the encoding can not represent the character, the entity
+ reference is left unescaped. Note that entity references (both
+ numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
+ unescaped in attribute values and the return value of .get_text(), but
+ not in data outside of tags. Instead, entity references outside of
+ tags are represented as tokens. This is a bit odd, it's true :-/
+
+ If the element name of an opening tag matches a key in the textify
+ mapping then that tag is converted to text. The corresponding value is
+ used to specify which tag attribute to obtain the text from. textify
+ maps from element names to either:
+
+ - an HTML attribute name, in which case the HTML attribute value is
+ used as its text value along with the element name in square
+ brackets (eg."alt text goes here[IMG]", or, if the alt attribute
+ were missing, just "[IMG]")
+ - a callable object (eg. a function) which takes a Token and returns
+ the string to be used as its text value
+
+ If textify has no key for an element name, nothing is substituted for
+ the opening tag.
+
+ Public attributes:
+
+ encoding and textify: see above
+
+ """
+ self._fh = fh
+ self._tokenstack = [] # FIFO
+ self.textify = textify
+ self.encoding = encoding
+ if entitydefs is None:
+ entitydefs = htmlentitydefs.name2codepoint
+ self._entitydefs = entitydefs
+
+ def __iter__(self): return self
+
+ def tags(self, *names):
+ return iter_until_exception(self.get_tag, NoMoreTokensError, *names)
+
+ def tokens(self, *tokentypes):
+ return iter_until_exception(self.get_token, NoMoreTokensError, *tokentypes)
+
+ def next(self):
+ try:
+ return self.get_token()
+ except NoMoreTokensError:
+ raise StopIteration()
+
+ def get_token(self, *tokentypes):
+ """Pop the next Token object from the stack of parsed tokens.
+
+ If arguments are given, they are taken to be token types in which the
+ caller is interested: tokens representing other elements will be
+ skipped. Element names must be given in lower case.
+
+ Raises NoMoreTokensError.
+
+ """
+ while 1:
+ while self._tokenstack:
+ token = self._tokenstack.pop(0)
+ if tokentypes:
+ if token.type in tokentypes:
+ return token
+ else:
+ return token
+ data = self._fh.read(self.chunk)
+ if not data:
+ raise NoMoreTokensError()
+ self.feed(data)
+
+ def unget_token(self, token):
+ """Push a Token back onto the stack."""
+ self._tokenstack.insert(0, token)
+
+ def get_tag(self, *names):
+ """Return the next Token that represents an opening or closing tag.
+
+ If arguments are given, they are taken to be element names in which the
+ caller is interested: tags representing other elements will be skipped.
+ Element names must be given in lower case.
+
+ Raises NoMoreTokensError.
+
+ """
+ while 1:
+ tok = self.get_token()
+ if tok.type not in ["starttag", "endtag", "startendtag"]:
+ continue
+ if names:
+ if tok.data in names:
+ return tok
+ else:
+ return tok
+
+ def get_text(self, endat=None):
+ """Get some text.
+
+ endat: stop reading text at this tag (the tag is included in the
+ returned text); endtag is a tuple (type, name) where type is
+ "starttag", "endtag" or "startendtag", and name is the element name of
+ the tag (element names must be given in lower case)
+
+ If endat is not given, .get_text() will stop at the next opening or
+ closing tag, or when there are no more tokens (no exception is raised).
+ Note that .get_text() includes the text representation (if any) of the
+ opening tag, but pushes the opening tag back onto the stack. As a
+ result, if you want to call .get_text() again, you need to call
+ .get_tag() first (unless you want an empty string returned when you
+ next call .get_text()).
+
+ Entity references are translated using the value of the entitydefs
+ constructor argument (a mapping from names to characters like that
+ provided by the standard module htmlentitydefs). Named entity
+ references that are not in this mapping are left unchanged.
+
+ The textify attribute is used to translate opening tags into text: see
+ the class docstring.
+
+ """
+ text = []
+ tok = None
+ while 1:
+ try:
+ tok = self.get_token()
+ except NoMoreTokensError:
+ # unget last token (not the one we just failed to get)
+ if tok: self.unget_token(tok)
+ break
+ if tok.type == "data":
+ text.append(tok.data)
+ elif tok.type == "entityref":
+ t = unescape("&%s;"%tok.data, self._entitydefs, self.encoding)
+ text.append(t)
+ elif tok.type == "charref":
+ t = unescape_charref(tok.data, self.encoding)
+ text.append(t)
+ elif tok.type in ["starttag", "endtag", "startendtag"]:
+ tag_name = tok.data
+ if tok.type in ["starttag", "startendtag"]:
+ alt = self.textify.get(tag_name)
+ if alt is not None:
+ if callable(alt):
+ text.append(alt(tok))
+ elif tok.attrs is not None:
+ for k, v in tok.attrs:
+ if k == alt:
+ text.append(v)
+ text.append("[%s]" % tag_name.upper())
+ if endat is None or endat == (tok.type, tag_name):
+ self.unget_token(tok)
+ break
+ return "".join(text)
+
+ def get_compressed_text(self, *args, **kwds):
+ """
+ As .get_text(), but collapses each group of contiguous whitespace to a
+ single space character, and removes all initial and trailing
+ whitespace.
+
+ """
+ text = self.get_text(*args, **kwds)
+ text = text.strip()
+ return self.compress_re.sub(" ", text)
+
+ def handle_startendtag(self, tag, attrs):
+ self._tokenstack.append(Token("startendtag", tag, attrs))
+ def handle_starttag(self, tag, attrs):
+ self._tokenstack.append(Token("starttag", tag, attrs))
+ def handle_endtag(self, tag):
+ self._tokenstack.append(Token("endtag", tag))
+ def handle_charref(self, name):
+ self._tokenstack.append(Token("charref", name))
+ def handle_entityref(self, name):
+ self._tokenstack.append(Token("entityref", name))
+ def handle_data(self, data):
+ self._tokenstack.append(Token("data", data))
+ def handle_comment(self, data):
+ self._tokenstack.append(Token("comment", data))
+ def handle_decl(self, decl):
+ self._tokenstack.append(Token("decl", decl))
+ def unknown_decl(self, data):
+ # XXX should this call self.error instead?
+ #self.error("unknown declaration: " + `data`)
+ self._tokenstack.append(Token("decl", data))
+ def handle_pi(self, data):
+ self._tokenstack.append(Token("pi", data))
+
+ def unescape_attr(self, name):
+ return unescape(name, self._entitydefs, self.encoding)
+ def unescape_attrs(self, attrs):
+ escaped_attrs = []
+ for key, val in attrs:
+ escaped_attrs.append((key, self.unescape_attr(val)))
+ return escaped_attrs
+
+class PullParser(_AbstractParser, HTMLParser.HTMLParser):
+ def __init__(self, *args, **kwds):
+ HTMLParser.HTMLParser.__init__(self)
+ _AbstractParser.__init__(self, *args, **kwds)
+ def unescape(self, name):
+ # Use the entitydefs passed into constructor, not
+ # HTMLParser.HTMLParser's entitydefs.
+ return self.unescape_attr(name)
+
+class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
+ def __init__(self, *args, **kwds):
+ sgmllib.SGMLParser.__init__(self)
+ _AbstractParser.__init__(self, *args, **kwds)
+ def unknown_starttag(self, tag, attrs):
+ attrs = self.unescape_attrs(attrs)
+ self._tokenstack.append(Token("starttag", tag, attrs))
+ def unknown_endtag(self, tag):
+ self._tokenstack.append(Token("endtag", tag))
+
+
+def _test():
+ import doctest, _pullparser
+ return doctest.testmod(_pullparser)
+
+if __name__ == "__main__":
+ _test()
Property changes on: Zope3/trunk/src/mechanize/_pullparser.py
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Added: Zope3/trunk/src/mechanize/_request.py
===================================================================
--- Zope3/trunk/src/mechanize/_request.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/_request.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -0,0 +1,68 @@
+"""Integration with Python standard library module urllib2: Request class.
+
+Copyright 2004-2006 John J Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file
+COPYING.txt included with the distribution).
+
+"""
+
+import urllib2, string
+
+from _clientcookie import request_host
+
+
+class Request(urllib2.Request):
+ def __init__(self, url, data=None, headers={},
+ origin_req_host=None, unverifiable=False):
+ urllib2.Request.__init__(self, url, data, headers)
+ self.unredirected_hdrs = {}
+
+ # All the terminology below comes from RFC 2965.
+ self.unverifiable = unverifiable
+ # Set request-host of origin transaction.
+ # The origin request-host is needed in order to decide whether
+ # unverifiable sub-requests (automatic redirects, images embedded
+ # in HTML, etc.) are to third-party hosts. If they are, the
+ # resulting transactions might need to be conducted with cookies
+ # turned off.
+ if origin_req_host is None:
+ origin_req_host = request_host(self)
+ self.origin_req_host = origin_req_host
+
+ def get_origin_req_host(self):
+ return self.origin_req_host
+
+ def is_unverifiable(self):
+ return self.unverifiable
+
+ def add_unredirected_header(self, key, val):
+ """Add a header that will not be added to a redirected request."""
+ self.unredirected_hdrs[string.capitalize(key)] = val
+
+ def has_header(self, header_name):
+ """True iff request has named header (regular or unredirected)."""
+ if (self.headers.has_key(header_name) or
+ self.unredirected_hdrs.has_key(header_name)):
+ return True
+ return False
+
+ def get_header(self, header_name, default=None):
+ return self.headers.get(
+ header_name,
+ self.unredirected_hdrs.get(header_name, default))
+
+ def header_items(self):
+ hdrs = self.unredirected_hdrs.copy()
+ hdrs.update(self.headers)
+ return hdrs.items()
+
+ def __str__(self):
+ return "<Request for %s>" % self.get_full_url()
+
+ def get_method(self):
+ if self.has_data():
+ return "POST"
+ else:
+ return "GET"
Property changes on: Zope3/trunk/src/mechanize/_request.py
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Added: Zope3/trunk/src/mechanize/_urllib2.py
===================================================================
--- Zope3/trunk/src/mechanize/_urllib2.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/_urllib2.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -0,0 +1,53 @@
+# urllib2 work-alike interface
+# ...from urllib2...
+from urllib2 import \
+ URLError, \
+ HTTPError, \
+ GopherError, \
+ HTTPPasswordMgr, \
+ HTTPPasswordMgrWithDefaultRealm, \
+ AbstractBasicAuthHandler, \
+ AbstractDigestAuthHandler
+# ...and from mechanize
+from _opener import OpenerDirector
+from _auth import \
+ HTTPProxyPasswordMgr, \
+ ProxyHandler, \
+ ProxyBasicAuthHandler, \
+ ProxyDigestAuthHandler, \
+ HTTPBasicAuthHandler, \
+ HTTPDigestAuthHandler
+from _urllib2_support import \
+ Request, \
+ build_opener, install_opener, urlopen, \
+ OpenerFactory, urlretrieve, \
+ RobotExclusionError
+
+# handlers...
+# ...from urllib2...
+from urllib2 import \
+ BaseHandler, \
+ HTTPDefaultErrorHandler, \
+ UnknownHandler, \
+ FTPHandler, \
+ CacheFTPHandler, \
+ FileHandler, \
+ GopherHandler
+# ...and from mechanize
+from _urllib2_support import \
+ HTTPHandler, \
+ HTTPRedirectHandler, \
+ HTTPRequestUpgradeProcessor, \
+ HTTPEquivProcessor, \
+ SeekableProcessor, \
+ HTTPCookieProcessor, \
+ HTTPRefererProcessor, \
+ HTTPRefreshProcessor, \
+ HTTPErrorProcessor, \
+ HTTPResponseDebugProcessor, \
+ HTTPRedirectDebugProcessor, \
+ HTTPRobotRulesProcessor
+import httplib
+if hasattr(httplib, 'HTTPS'):
+ from _urllib2_support import HTTPSHandler
+del httplib
Property changes on: Zope3/trunk/src/mechanize/_urllib2.py
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Added: Zope3/trunk/src/mechanize/_urllib2_support.py
===================================================================
--- Zope3/trunk/src/mechanize/_urllib2_support.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/_urllib2_support.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -0,0 +1,718 @@
+"""Integration with Python standard library module urllib2.
+
+Also includes a redirection bugfix, support for parsing HTML HEAD blocks for
+the META HTTP-EQUIV tag contents, and following Refresh header redirects.
+
+Copyright 2002-2006 John J Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file
+COPYING.txt included with the distribution).
+
+"""
+
+import copy, time, tempfile, htmlentitydefs, re, logging, types, \
+ string, socket, urlparse, urllib2, urllib, httplib, sgmllib
+from urllib2 import URLError, HTTPError, BaseHandler
+from cStringIO import StringIO
+try:
+ import threading as _threading
+except ImportError:
+ import dummy_threading as _threading
+
+import _opener
+from _request import Request
+from _util import isstringlike, startswith, \
+ getheaders, closeable_response, response_seek_wrapper
+from _html import unescape, unescape_charref
+from _headersutil import is_html
+from _clientcookie import CookieJar, request_host
+
+debug = logging.getLogger("mechanize.cookies").debug
+
+
+CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes
+DEFAULT_ENCODING = 'latin-1'
+
+
+# This fixes a bug in urllib2 as of Python 2.1.3 and 2.2.2
+# (http://www.python.org/sf/549151)
+# 2.2.3 is broken here (my fault!), 2.3 is fixed.
+class HTTPRedirectHandler(BaseHandler):
+ # maximum number of redirections to any single URL
+ # this is needed because of the state that cookies introduce
+ max_repeats = 4
+ # maximum total number of redirections (regardless of URL) before
+ # assuming we're in a loop
+ max_redirections = 10
+
+ # Implementation notes:
+
+ # To avoid the server sending us into an infinite loop, the request
+ # object needs to track what URLs we have already seen. Do this by
+ # adding a handler-specific attribute to the Request object. The value
+ # of the dict is used to count the number of times the same URL has
+ # been visited. This is needed because visiting the same URL twice
+ # does not necessarily imply a loop, thanks to state introduced by
+ # cookies.
+
+ # Always unhandled redirection codes:
+ # 300 Multiple Choices: should not handle this here.
+ # 304 Not Modified: no need to handle here: only of interest to caches
+ # that do conditional GETs
+ # 305 Use Proxy: probably not worth dealing with here
+ # 306 Unused: what was this for in the previous versions of protocol??
+
+ def redirect_request(self, newurl, req, fp, code, msg, headers):
+ """Return a Request or None in response to a redirect.
+
+ This is called by the http_error_30x methods when a redirection
+ response is received. If a redirection should take place, return a
+ new Request to allow http_error_30x to perform the redirect;
+ otherwise, return None to indicate that an HTTPError should be
+ raised.
+
+ """
+ if code in (301, 302, 303, "refresh") or \
+ (code == 307 and not req.has_data()):
+ # Strictly (according to RFC 2616), 301 or 302 in response to
+ # a POST MUST NOT cause a redirection without confirmation
+ # from the user (of urllib2, in this case). In practice,
+ # essentially all clients do redirect in this case, so we do
+ # the same.
+ return Request(newurl,
+ headers=req.headers,
+ origin_req_host=req.get_origin_req_host(),
+ unverifiable=True)
+ else:
+ raise HTTPError(req.get_full_url(), code, msg, headers, fp)
+
+ def http_error_302(self, req, fp, code, msg, headers):
+ # Some servers (incorrectly) return multiple Location headers
+ # (so probably same goes for URI). Use first header.
+ if headers.has_key('location'):
+ newurl = getheaders(headers, 'location')[0]
+ elif headers.has_key('uri'):
+ newurl = getheaders(headers, 'uri')[0]
+ else:
+ return
+ newurl = urlparse.urljoin(req.get_full_url(), newurl)
+
+ # XXX Probably want to forget about the state of the current
+ # request, although that might interact poorly with other
+ # handlers that also use handler-specific request attributes
+ new = self.redirect_request(newurl, req, fp, code, msg, headers)
+ if new is None:
+ return
+
+ # loop detection
+ # .redirect_dict has a key url if url was previously visited.
+ if hasattr(req, 'redirect_dict'):
+ visited = new.redirect_dict = req.redirect_dict
+ if (visited.get(newurl, 0) >= self.max_repeats or
+ len(visited) >= self.max_redirections):
+ raise HTTPError(req.get_full_url(), code,
+ self.inf_msg + msg, headers, fp)
+ else:
+ visited = new.redirect_dict = req.redirect_dict = {}
+ visited[newurl] = visited.get(newurl, 0) + 1
+
+ # Don't close the fp until we are sure that we won't use it
+ # with HTTPError.
+ fp.read()
+ fp.close()
+
+ return self.parent.open(new)
+
+ http_error_301 = http_error_303 = http_error_307 = http_error_302
+ http_error_refresh = http_error_302
+
+ inf_msg = "The HTTP server returned a redirect error that would " \
+ "lead to an infinite loop.\n" \
+ "The last 30x error message was:\n"
+
+
+class HTTPRequestUpgradeProcessor(BaseHandler):
+ # upgrade urllib2.Request to this module's Request
+ # yuck!
+ handler_order = 0 # before anything else
+
+ def http_request(self, request):
+ if not hasattr(request, "add_unredirected_header"):
+ newrequest = Request(request._Request__original, request.data,
+ request.headers)
+ try: newrequest.origin_req_host = request.origin_req_host
+ except AttributeError: pass
+ try: newrequest.unverifiable = request.unverifiable
+ except AttributeError: pass
+ request = newrequest
+ return request
+
+ https_request = http_request
+
+# XXX would self.reset() work, instead of raising this exception?
+class EndOfHeadError(Exception): pass
+class AbstractHeadParser:
+ # only these elements are allowed in or before HEAD of document
+ head_elems = ("html", "head",
+ "title", "base",
+ "script", "style", "meta", "link", "object")
+ _entitydefs = htmlentitydefs.name2codepoint
+ _encoding = DEFAULT_ENCODING
+
+ def __init__(self):
+ self.http_equiv = []
+
+ def start_meta(self, attrs):
+ http_equiv = content = None
+ for key, value in attrs:
+ if key == "http-equiv":
+ http_equiv = self.unescape_attr_if_required(value)
+ elif key == "content":
+ content = self.unescape_attr_if_required(value)
+ if http_equiv is not None:
+ self.http_equiv.append((http_equiv, content))
+
+ def end_head(self):
+ raise EndOfHeadError()
+
+ def handle_entityref(self, name):
+ #debug("%s", name)
+ self.handle_data(unescape(
+ '&%s;' % name, self._entitydefs, self._encoding))
+
+ def handle_charref(self, name):
+ #debug("%s", name)
+ self.handle_data(unescape_charref(name, self._encoding))
+
+ def unescape_attr(self, name):
+ #debug("%s", name)
+ return unescape(name, self._entitydefs, self._encoding)
+
+ def unescape_attrs(self, attrs):
+ #debug("%s", attrs)
+ escaped_attrs = {}
+ for key, val in attrs.items():
+ escaped_attrs[key] = self.unescape_attr(val)
+ return escaped_attrs
+
+ def unknown_entityref(self, ref):
+ self.handle_data("&%s;" % ref)
+
+ def unknown_charref(self, ref):
+ self.handle_data("&#%s;" % ref)
+
+
+try:
+ import HTMLParser
+except ImportError:
+ pass
+else:
+ class XHTMLCompatibleHeadParser(AbstractHeadParser,
+ HTMLParser.HTMLParser):
+ def __init__(self):
+ HTMLParser.HTMLParser.__init__(self)
+ AbstractHeadParser.__init__(self)
+
+ def handle_starttag(self, tag, attrs):
+ if tag not in self.head_elems:
+ raise EndOfHeadError()
+ try:
+ method = getattr(self, 'start_' + tag)
+ except AttributeError:
+ try:
+ method = getattr(self, 'do_' + tag)
+ except AttributeError:
+ pass # unknown tag
+ else:
+ method(attrs)
+ else:
+ method(attrs)
+
+ def handle_endtag(self, tag):
+ if tag not in self.head_elems:
+ raise EndOfHeadError()
+ try:
+ method = getattr(self, 'end_' + tag)
+ except AttributeError:
+ pass # unknown tag
+ else:
+ method()
+
+ def unescape(self, name):
+ # Use the entitydefs passed into constructor, not
+ # HTMLParser.HTMLParser's entitydefs.
+ return self.unescape_attr(name)
+
+ def unescape_attr_if_required(self, name):
+ return name # HTMLParser.HTMLParser already did it
+
+class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):
+
+ def _not_called(self):
+ assert False
+
+ def __init__(self):
+ sgmllib.SGMLParser.__init__(self)
+ AbstractHeadParser.__init__(self)
+
+ def handle_starttag(self, tag, method, attrs):
+ if tag not in self.head_elems:
+ raise EndOfHeadError()
+ if tag == "meta":
+ method(attrs)
+
+ def unknown_starttag(self, tag, attrs):
+ self.handle_starttag(tag, self._not_called, attrs)
+
+ def handle_endtag(self, tag, method):
+ if tag in self.head_elems:
+ method()
+ else:
+ raise EndOfHeadError()
+
+ def unescape_attr_if_required(self, name):
+ return self.unescape_attr(name)
+
+def parse_head(fileobj, parser):
+ """Return a list of key, value pairs."""
+ while 1:
+ data = fileobj.read(CHUNK)
+ try:
+ parser.feed(data)
+ except EndOfHeadError:
+ break
+ if len(data) != CHUNK:
+ # this should only happen if there is no HTML body, or if
+ # CHUNK is big
+ break
+ return parser.http_equiv
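+
+# Illustrative sketch (the HTML is made up): parse_head() returns the
+# http-equiv/content pairs found in the document HEAD:
+#
+#     from cStringIO import StringIO
+#     html = StringIO('<html><head>'
+#                     '<meta http-equiv="refresh" content="5"></head>')
+#     parse_head(html, HeadParser())   # -> [('refresh', '5')]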
+
+class HTTPEquivProcessor(BaseHandler):
+ """Append META HTTP-EQUIV headers to regular HTTP headers."""
+
+ handler_order = 300 # before handlers that look at HTTP headers
+
+ def __init__(self, head_parser_class=HeadParser,
+ i_want_broken_xhtml_support=False,
+ ):
+ self.head_parser_class = head_parser_class
+ self._allow_xhtml = i_want_broken_xhtml_support
+
+ def http_response(self, request, response):
+ if not hasattr(response, "seek"):
+ response = response_seek_wrapper(response)
+ headers = response.info()
+ url = response.geturl()
+ ct_hdrs = getheaders(response.info(), "content-type")
+ if is_html(ct_hdrs, url, self._allow_xhtml):
+ try:
+ try:
+ html_headers = parse_head(response, self.head_parser_class())
+ finally:
+ response.seek(0)
+ except (HTMLParser.HTMLParseError,
+ sgmllib.SGMLParseError):
+ pass
+ else:
+ for hdr, val in html_headers:
+ # rfc822.Message interprets this as appending, not clobbering
+ headers[hdr] = val
+ return response
+
+ https_response = http_response
+
+class SeekableProcessor(BaseHandler):
+ """Make responses seekable."""
+
+ def any_response(self, request, response):
+ if not hasattr(response, "seek"):
+ return response_seek_wrapper(response)
+ return response
+
+class HTTPCookieProcessor(BaseHandler):
+ """Handle HTTP cookies.
+
+ Public attributes:
+
+ cookiejar: CookieJar instance
+
+ """
+ def __init__(self, cookiejar=None):
+ if cookiejar is None:
+ cookiejar = CookieJar()
+ self.cookiejar = cookiejar
+
+ def http_request(self, request):
+ self.cookiejar.add_cookie_header(request)
+ return request
+
+ def http_response(self, request, response):
+ self.cookiejar.extract_cookies(response, request)
+ return response
+
+ https_request = http_request
+ https_response = http_response
+
+try:
+ import robotparser
+except ImportError:
+ pass
+else:
+ class RobotExclusionError(urllib2.HTTPError):
+ def __init__(self, request, *args):
+ apply(urllib2.HTTPError.__init__, (self,)+args)
+ self.request = request
+
+ class HTTPRobotRulesProcessor(BaseHandler):
+ # before redirections, after everything else
+ handler_order = 800
+
+ try:
+ from httplib import HTTPMessage
+ except:
+ from mimetools import Message
+ http_response_class = Message
+ else:
+ http_response_class = HTTPMessage
+
+ def __init__(self, rfp_class=robotparser.RobotFileParser):
+ self.rfp_class = rfp_class
+ self.rfp = None
+ self._host = None
+
+ def http_request(self, request):
+ host = request.get_host()
+ scheme = request.get_type()
+ if host != self._host:
+ self.rfp = self.rfp_class()
+ self.rfp.set_url(scheme+"://"+host+"/robots.txt")
+ self.rfp.read()
+ self._host = host
+
+ ua = request.get_header("User-agent", "")
+ if self.rfp.can_fetch(ua, request.get_full_url()):
+ return request
+ else:
+ msg = "request disallowed by robots.txt"
+ raise RobotExclusionError(
+ request,
+ request.get_full_url(),
+ 403, msg,
+ self.http_response_class(StringIO()), StringIO(msg))
+
+ https_request = http_request
+
+class HTTPRefererProcessor(BaseHandler):
+ """Add Referer header to requests.
+
+ This only makes sense if you use each RefererProcessor for a single
+ chain of requests (so, for example, if you use a single
+ HTTPRefererProcessor to fetch a series of URLs extracted from a single
+ page, this will break).
+
+ There's a proper implementation of this in module mechanize.
+
+ """
+ def __init__(self):
+ self.referer = None
+
+ def http_request(self, request):
+ if ((self.referer is not None) and
+ not request.has_header("Referer")):
+ request.add_unredirected_header("Referer", self.referer)
+ return request
+
+ def http_response(self, request, response):
+ self.referer = response.geturl()
+ return response
+
+ https_request = http_request
+ https_response = http_response
+
+class HTTPResponseDebugProcessor(BaseHandler):
+ handler_order = 900 # before redirections, after everything else
+
+ def http_response(self, request, response):
+ if not hasattr(response, "seek"):
+ response = response_seek_wrapper(response)
+ info = getLogger("mechanize.http_responses").info
+ try:
+ info(response.read())
+ finally:
+ response.seek(0)
+ info("*****************************************************")
+ return response
+
+ https_response = http_response
+
+class HTTPRedirectDebugProcessor(BaseHandler):
+ def http_request(self, request):
+ if hasattr(request, "redirect_dict"):
+ info = getLogger("mechanize.http_redirects").info
+ info("redirecting to %s", request.get_full_url())
+ return request
+
+class HTTPRefreshProcessor(BaseHandler):
+ """Perform HTTP Refresh redirections.
+
+ Note that if a non-200 HTTP code has occurred (for example, a 30x
+ redirect), this processor will do nothing.
+
+ By default, only zero-time Refresh headers are redirected. Use the
+ max_time attribute / constructor argument to allow Refresh with longer
+ pauses. Use the honor_time attribute / constructor argument to control
+ whether the requested pause is honoured (with a time.sleep()) or
+ skipped in favour of immediate redirection.
+
+ Public attributes:
+
+ max_time: see above
+ honor_time: see above
+
+ """
+ handler_order = 1000
+
+ def __init__(self, max_time=0, honor_time=True):
+ self.max_time = max_time
+ self.honor_time = honor_time
+
+ def http_response(self, request, response):
+ code, msg, hdrs = response.code, response.msg, response.info()
+
+ if code == 200 and hdrs.has_key("refresh"):
+ refresh = getheaders(hdrs, "refresh")[0]
+ ii = string.find(refresh, ";")
+ if ii != -1:
+ pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:]
+ jj = string.find(newurl_spec, "=")
+ if jj != -1:
+ key, newurl = newurl_spec[:jj], newurl_spec[jj+1:]
+ if key.strip().lower() != "url":
+ debug("bad Refresh header: %r" % refresh)
+ return response
+ else:
+ pause, newurl = float(refresh), response.geturl()
+ if (self.max_time is None) or (pause <= self.max_time):
+ if pause > 1E-3 and self.honor_time:
+ time.sleep(pause)
+ hdrs["location"] = newurl
+ # hardcoded http is NOT a bug
+ response = self.parent.error(
+ "http", request, response,
+ "refresh", msg, hdrs)
+
+ return response
+
+ https_response = http_response
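+
+# Illustrative sketch (the numbers are arbitrary): a processor that follows
+# Refresh headers with pauses of up to 30 seconds, without actually sleeping:
+#
+#     processor = HTTPRefreshProcessor(max_time=30, honor_time=False)
+#     opener = build_opener(processor)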
+
+class HTTPErrorProcessor(BaseHandler):
+ """Process HTTP error responses.
+
+ The purpose of this handler is to allow other response processors a
+ look-in by removing the call to parent.error() from
+ AbstractHTTPHandler.
+
+ For non-200 error codes, this just passes the job on to the
+ Handler.<proto>_error_<code> methods, via the OpenerDirector.error
+ method. Eventually, urllib2.HTTPDefaultErrorHandler will raise an
+ HTTPError if no other handler handles the error.
+
+ """
+ handler_order = 1000 # after all other processors
+
+ def http_response(self, request, response):
+ code, msg, hdrs = response.code, response.msg, response.info()
+
+ if code != 200:
+ # hardcoded http is NOT a bug
+ response = self.parent.error(
+ "http", request, response, code, msg, hdrs)
+
+ return response
+
+ https_response = http_response
+
+
+class AbstractHTTPHandler(BaseHandler):
+
+ def __init__(self, debuglevel=0):
+ self._debuglevel = debuglevel
+
+ def set_http_debuglevel(self, level):
+ self._debuglevel = level
+
+ def do_request_(self, request):
+ host = request.get_host()
+ if not host:
+ raise URLError('no host given')
+
+ if request.has_data(): # POST
+ data = request.get_data()
+ if not request.has_header('Content-type'):
+ request.add_unredirected_header(
+ 'Content-type',
+ 'application/x-www-form-urlencoded')
+
+ scheme, sel = urllib.splittype(request.get_selector())
+ sel_host, sel_path = urllib.splithost(sel)
+ if not request.has_header('Host'):
+ request.add_unredirected_header('Host', sel_host or host)
+ for name, value in self.parent.addheaders:
+ name = string.capitalize(name)
+ if not request.has_header(name):
+ request.add_unredirected_header(name, value)
+
+ return request
+
+ def do_open(self, http_class, req):
+ """Return an addinfourl object for the request, using http_class.
+
+ http_class must implement the HTTPConnection API from httplib.
+ The addinfourl return value is a file-like object. It also
+ has methods and attributes including:
+ - info(): return a mimetools.Message object for the headers
+ - geturl(): return the original request URL
+ - code: HTTP status code
+ """
+ host = req.get_host()
+ if not host:
+ raise URLError('no host given')
+
+ h = http_class(host) # will parse host:port
+ h.set_debuglevel(self._debuglevel)
+
+ headers = req.headers.copy()
+ headers.update(req.unredirected_hdrs)
+ # We want to make an HTTP/1.1 request, but the addinfourl
+ # class isn't prepared to deal with a persistent connection.
+ # It will try to read all remaining data from the socket,
+ # which will block while the server waits for the next request.
+ # So make sure the connection gets closed after the (only)
+ # request.
+ headers["Connection"] = "close"
+ try:
+ h.request(req.get_method(), req.get_selector(), req.data, headers)
+ r = h.getresponse()
+ except socket.error, err: # XXX what error?
+ raise URLError(err)
+
+ # Pick apart the HTTPResponse object to get the addinfourl
+ # object initialized properly.
+
+ # Wrap the HTTPResponse object in socket's file object adapter
+ # for Windows. That adapter calls recv(), so delegate recv()
+ # to read(). This weird wrapping allows the returned object to
+ # have readline() and readlines() methods.
+
+ # XXX It might be better to extract the read buffering code
+ # out of socket._fileobject() and into a base class.
+
+ r.recv = r.read
+ fp = socket._fileobject(r, 'rb', -1)
+
+ resp = closeable_response(fp, r.msg, req.get_full_url(),
+ r.status, r.reason)
+ return resp
+
+
+class HTTPHandler(AbstractHTTPHandler):
+ def http_open(self, req):
+ return self.do_open(httplib.HTTPConnection, req)
+
+ http_request = AbstractHTTPHandler.do_request_
+
+if hasattr(httplib, 'HTTPS'):
+ class HTTPSHandler(AbstractHTTPHandler):
+ def https_open(self, req):
+ return self.do_open(httplib.HTTPSConnection, req)
+
+ https_request = AbstractHTTPHandler.do_request_
+
+class OpenerFactory:
+ """This class's interface is quite likely to change."""
+
+ default_classes = [
+ # handlers
+ urllib2.ProxyHandler,
+ urllib2.UnknownHandler,
+ HTTPHandler, # from this module (derived from new AbstractHTTPHandler)
+ urllib2.HTTPDefaultErrorHandler,
+ HTTPRedirectHandler, # from this module (bugfixed)
+ urllib2.FTPHandler,
+ urllib2.FileHandler,
+ # processors
+ HTTPRequestUpgradeProcessor,
+ HTTPCookieProcessor,
+ HTTPErrorProcessor
+ ]
+ handlers = []
+ replacement_handlers = []
+
+ def __init__(self, klass=_opener.OpenerDirector):
+ self.klass = klass
+
+ def build_opener(self, *handlers):
+ """Create an opener object from a list of handlers and processors.
+
+ The opener will use several default handlers and processors, including
+ support for HTTP and FTP.
+
+ If any of the handlers passed as arguments are subclasses of the
+ default handlers, the default handlers will not be used.
+
+ """
+ opener = self.klass()
+ default_classes = list(self.default_classes)
+ if hasattr(httplib, 'HTTPS'):
+ default_classes.append(HTTPSHandler)
+ skip = []
+ for klass in default_classes:
+ for check in handlers:
+ if type(check) == types.ClassType:
+ if issubclass(check, klass):
+ skip.append(klass)
+ elif type(check) == types.InstanceType:
+ if isinstance(check, klass):
+ skip.append(klass)
+ for klass in skip:
+ default_classes.remove(klass)
+
+ for klass in default_classes:
+ opener.add_handler(klass())
+ for h in handlers:
+ if type(h) == types.ClassType:
+ h = h()
+ opener.add_handler(h)
+
+ return opener
+
+build_opener = OpenerFactory().build_opener
+
+_opener = None
+urlopen_lock = _threading.Lock()
+def urlopen(url, data=None):
+ global _opener
+ if _opener is None:
+ urlopen_lock.acquire()
+ try:
+ if _opener is None:
+ _opener = build_opener()
+ finally:
+ urlopen_lock.release()
+ return _opener.open(url, data)
+
+def urlretrieve(url, filename=None, reporthook=None, data=None):
+ global _opener
+ if _opener is None:
+ urlopen_lock.acquire()
+ try:
+ if _opener is None:
+ _opener = build_opener()
+ finally:
+ urlopen_lock.release()
+ return _opener.retrieve(url, filename, reporthook, data)
+
+def install_opener(opener):
+ global _opener
+ _opener = opener
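+
+# Illustrative sketch (URLs are made up): urlopen() and urlretrieve() lazily
+# build a default opener on first use; install_opener() replaces it with a
+# caller-supplied one for subsequent calls.
+#
+#     install_opener(build_opener())
+#     response = urlopen("http://example.com/")
+#     urlretrieve("http://example.com/robots.txt", "robots.txt")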
Property changes on: Zope3/trunk/src/mechanize/_urllib2_support.py
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Modified: Zope3/trunk/src/mechanize/_useragent.py
===================================================================
--- Zope3/trunk/src/mechanize/_useragent.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/_useragent.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -3,45 +3,24 @@
This is a subclass of urllib2.OpenerDirector.
-Copyright 2003 John J. Lee <jjl at pobox.com>
+Copyright 2003-2006 John J. Lee <jjl at pobox.com>
This code is free software; you can redistribute it and/or modify it under
-the terms of the BSD License (see the file COPYING included with the
-distribution).
+the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
+included with the distribution).
"""
-import sys
-import urllib2, httplib
-import ClientCookie
-if sys.version_info[:2] >= (2, 4):
- import cookielib
- from urllib2 import OpenerDirector, BaseHandler, \
- HTTPHandler, HTTPErrorProcessor
- try:
- from urllib2 import HTTPSHandler
- except ImportError:
- pass
- class SaneHTTPCookieProcessor(ClientCookie.HTTPCookieProcessor):
- # Workaround for RFC 2109 bug http://python.org/sf/1157027 (at least if
- # you don't pass your own CookieJar in: if that's the case, you should
- # pass rfc2965=True to the DefaultCookiePolicy constructor yourself, or
- # set the corresponding attribute).
- def __init__(self, cookiejar=None):
- if cookiejar is None:
- cookiejar = cookielib.CookieJar(
- cookielib.DefaultCookiePolicy(rfc2965=True))
- self.cookiejar = cookiejar
- HTTPCookieProcessor = SaneHTTPCookieProcessor
-else:
- from ClientCookie import OpenerDirector, BaseHandler, \
- HTTPHandler, HTTPErrorProcessor, HTTPCookieProcessor
- try:
- from ClientCookie import HTTPSHandler
- except ImportError:
- pass
+import sys, warnings, urllib2
-class HTTPRefererProcessor(BaseHandler):
+from _opener import OpenerDirector
+
+import _urllib2
+import _auth
+import _gzip
+
+
+class HTTPRefererProcessor(_urllib2.BaseHandler):
def http_request(self, request):
# See RFC 2616 14.36. The only times we know the source of the
# request URI has a URI associated with it are redirect, and
@@ -76,66 +55,81 @@
handler_classes = {
# scheme handlers
- "http": HTTPHandler,
- "ftp": urllib2.FTPHandler, # CacheFTPHandler is buggy in 2.3
- "file": urllib2.FileHandler,
- "gopher": urllib2.GopherHandler,
- # XXX etc.
+ "http": _urllib2.HTTPHandler,
+ # CacheFTPHandler is buggy, at least in 2.3, so we don't use it
+ "ftp": _urllib2.FTPHandler,
+ "file": _urllib2.FileHandler,
+ "gopher": _urllib2.GopherHandler,
# other handlers
- "_unknown": urllib2.UnknownHandler,
+ "_unknown": _urllib2.UnknownHandler,
# HTTP{S,}Handler depend on HTTPErrorProcessor too
- "_http_error": HTTPErrorProcessor,
- "_http_request_upgrade": ClientCookie.HTTPRequestUpgradeProcessor,
- "_http_default_error": urllib2.HTTPDefaultErrorHandler,
+ "_http_error": _urllib2.HTTPErrorProcessor,
+ "_http_request_upgrade": _urllib2.HTTPRequestUpgradeProcessor,
+ "_http_default_error": _urllib2.HTTPDefaultErrorHandler,
# feature handlers
- "_authen": urllib2.HTTPBasicAuthHandler,
- # XXX rest of authentication stuff
- "_redirect": ClientCookie.HTTPRedirectHandler,
- "_cookies": HTTPCookieProcessor,
- "_refresh": ClientCookie.HTTPRefreshProcessor,
+ "_basicauth": _urllib2.HTTPBasicAuthHandler,
+ "_digestauth": _urllib2.HTTPDigestAuthHandler,
+ "_redirect": _urllib2.HTTPRedirectHandler,
+ "_cookies": _urllib2.HTTPCookieProcessor,
+ "_refresh": _urllib2.HTTPRefreshProcessor,
"_referer": HTTPRefererProcessor, # from this module, note
- "_equiv": ClientCookie.HTTPEquivProcessor,
- "_seek": ClientCookie.SeekableProcessor,
- "_proxy": urllib2.ProxyHandler,
- # XXX there's more to proxies, too
+ "_equiv": _urllib2.HTTPEquivProcessor,
+ "_seek": _urllib2.SeekableProcessor,
+ "_proxy": _urllib2.ProxyHandler,
+ "_proxy_basicauth": _urllib2.ProxyBasicAuthHandler,
+ "_proxy_digestauth": _urllib2.ProxyDigestAuthHandler,
+ "_robots": _urllib2.HTTPRobotRulesProcessor,
+ "_gzip": _gzip.HTTPGzipProcessor, # experimental!
# debug handlers
- "_debug_redirect": ClientCookie.HTTPRedirectDebugProcessor,
- "_debug_response_body": ClientCookie.HTTPResponseDebugProcessor,
+ "_debug_redirect": _urllib2.HTTPRedirectDebugProcessor,
+ "_debug_response_body": _urllib2.HTTPResponseDebugProcessor,
}
default_schemes = ["http", "ftp", "file", "gopher"]
default_others = ["_unknown", "_http_error", "_http_request_upgrade",
- "_http_default_error"]
- default_features = ["_authen", "_redirect", "_cookies", "_refresh",
- "_referer", "_equiv", "_seek", "_proxy"]
- if hasattr(httplib, 'HTTPS'):
- handler_classes["https"] = HTTPSHandler
+ "_http_default_error",
+ ]
+ default_features = ["_redirect", "_cookies", "_referer",
+ "_refresh", "_equiv",
+ "_basicauth", "_digestauth",
+ "_proxy", "_proxy_basicauth", "_proxy_digestauth",
+ "_seek", "_robots",
+ ]
+ if hasattr(_urllib2, 'HTTPSHandler'):
+ handler_classes["https"] = _urllib2.HTTPSHandler
default_schemes.append("https")
- if hasattr(ClientCookie, "HTTPRobotRulesProcessor"):
- handler_classes["_robots"] = ClientCookie.HTTPRobotRulesProcessor
- default_features.append("_robots")
def __init__(self):
OpenerDirector.__init__(self)
- self._ua_handlers = {}
+ ua_handlers = self._ua_handlers = {}
for scheme in (self.default_schemes+
self.default_others+
self.default_features):
klass = self.handler_classes[scheme]
- self._ua_handlers[scheme] = klass()
- for handler in self._ua_handlers.itervalues():
+ ua_handlers[scheme] = klass()
+ for handler in ua_handlers.itervalues():
self.add_handler(handler)
+ # Yuck.
# Ensure correct default constructor args were passed to
- # HTTPRefererProcessor and HTTPEquivProcessor. Yuck.
- if '_refresh' in self._ua_handlers:
+ # HTTPRefererProcessor and HTTPEquivProcessor.
+ if "_refresh" in ua_handlers:
self.set_handle_refresh(True)
- if '_equiv' in self._ua_handlers:
+ if "_equiv" in ua_handlers:
self.set_handle_equiv(True)
+ # Ensure default password managers are installed.
+ pm = ppm = None
+ if "_basicauth" in ua_handlers or "_digestauth" in ua_handlers:
+ pm = _urllib2.HTTPPasswordMgrWithDefaultRealm()
+ if ("_proxy_basicauth" in ua_handlers or
+ "_proxy_digestauth" in ua_handlers):
+ ppm = _auth.HTTPProxyPasswordMgr()
+ self.set_password_manager(pm)
+ self.set_proxy_password_manager(ppm)
# special case, requires extra support from mechanize.Browser
self._handle_referer = True
@@ -154,17 +148,20 @@
## self._ftp_conn_cache = conn_cache
def set_handled_schemes(self, schemes):
- """Set sequence of protocol scheme strings.
+ """Set sequence of URL scheme (protocol) strings.
+ For example: ua.set_handled_schemes(["http", "ftp"])
+
If this fails (with ValueError) because you've passed an unknown
- scheme, the set of handled schemes WILL be updated, but schemes in the
- list that come after the unknown scheme won't be handled.
+ scheme, the set of handled schemes will not be changed.
"""
want = {}
for scheme in schemes:
if scheme.startswith("_"):
- raise ValueError("invalid scheme '%s'" % scheme)
+ raise ValueError("not a scheme '%s'" % scheme)
+ if scheme not in self.handler_classes:
+ raise ValueError("unknown scheme '%s'")
want[scheme] = None
# get rid of scheme handlers we don't want
@@ -176,8 +173,6 @@
del want[scheme] # already got it
# add the scheme handlers that are missing
for scheme in want.keys():
- if scheme not in self.handler_classes:
- raise ValueError("unknown scheme '%s'")
self._set_handler(scheme, True)
def _add_referer_header(self, request, origin_request=True):
@@ -185,13 +180,39 @@
"this class can't do HTTP Referer: use mechanize.Browser instead")
def set_cookiejar(self, cookiejar):
- """Set a ClientCookie.CookieJar, or None."""
+ """Set a mechanize.CookieJar, or None."""
self._set_handler("_cookies", obj=cookiejar)
- def set_credentials(self, credentials):
- """Set a urllib2.HTTPPasswordMgr, or None."""
- # XXX use Greg Stein's httpx instead?
- self._set_handler("_authen", obj=credentials)
+ # XXX could use Greg Stein's httpx for some of this instead?
+ # or httplib2??
+ def set_proxies(self, proxies):
+ """Set a dictionary mapping URL scheme to proxy specification, or None.
+
+ e.g. {"http": "joe:password at myproxy.example.com:3128",
+ "ftp": "proxy.example.com"}
+
+ """
+ self._set_handler("_proxy", obj=proxies)
+
+ def add_password(self, url, user, password, realm=None):
+ self._password_manager.add_password(realm, url, user, password)
+ def add_proxy_password(self, user, password, hostport=None, realm=None):
+ self._proxy_password_manager.add_password(
+ realm, hostport, user, password)
+
+ # the following are rarely useful -- use add_password / add_proxy_password
+ # instead
+ def set_password_manager(self, password_manager):
+ """Set a mechanize.HTTPPasswordMgrWithDefaultRealm, or None."""
+ self._password_manager = password_manager
+ self._set_handler("_basicauth", obj=password_manager)
+ self._set_handler("_digestauth", obj=password_manager)
+ def set_proxy_password_manager(self, password_manager):
+ """Set a mechanize.HTTPProxyPasswordMgr, or None."""
+ self._proxy_password_manager = password_manager
+ self._set_handler("_proxy_basicauth", obj=password_manager)
+ self._set_handler("_proxy_digestauth", obj=password_manager)
+
# these methods all take a boolean parameter
def set_handle_robots(self, handle):
"""Set whether to observe rules from robots.txt."""
@@ -223,37 +244,38 @@
"""
self._set_handler("_referer", handle)
- self._handle_referer = True
- def set_seekable_responses(self, handle):
- """Make response objects .seek()able."""
- self._set_handler("_seek", handle)
+ self._handle_referer = bool(handle)
+ def set_handle_gzip(self, handle):
+ """Handle gzip transfer encoding.
+
+ """
+ if handle:
+ warnings.warn(
+ "gzip transfer encoding is experimental!", stacklevel=2)
+ self._set_handler("_gzip", handle)
def set_debug_redirects(self, handle):
- """Log information about HTTP redirects.
+ """Log information about HTTP redirects (including refreshes).
- This includes refreshes, which show up as faked 302 redirections at the
- moment.
-
- Logs is performed using module logging. The logger name is
- "ClientCookie.http_redirects". To actually print some debug output,
+ Logging is performed using module logging. The logger name is
+ "mechanize.http_redirects". To actually print some debug output,
eg:
- logger = logging.getLogger("ClientCookie.http_redirects")
- logger.addHandler(logging.StreamHandler())
+ import sys, logging
+ logger = logging.getLogger("mechanize.http_redirects")
+ logger.addHandler(logging.StreamHandler(sys.stdout))
logger.setLevel(logging.INFO)
Other logger names relevant to this module:
- "ClientCookie.http_responses"
- "ClientCookie.cookies" (or "cookielib" if running Python 2.4)
+ "mechanize.http_responses"
+ "mechanize.cookies" (or "cookielib" if running Python 2.4)
To turn on everything:
- for logger in [
- logging.getLogger("ClientCookie"),
- logging.getLogger("cookielib"),
- ]:
- logger.addHandler(logging.StreamHandler())
- logger.setLevel(logging.INFO)
+ import sys, logging
+ logger = logging.getLogger("mechanize")
+ logger.addHandler(logging.StreamHandler(sys.stdout))
+ logger.setLevel(logging.INFO)
"""
self._set_handler("_debug_redirect", handle)
@@ -289,29 +311,13 @@
def _replace_handler(self, name, newhandler=None):
# first, if handler was previously added, remove it
if name is not None:
- try:
- handler = self._ua_handlers[name]
- except:
- pass
- else:
- for table in (
- [self.handle_open,
- self.process_request, self.process_response]+
- self.handle_error.values()):
- for handlers in table.values():
- remove(handlers, handler)
- remove(self.handlers, handler)
+ handler = self._ua_handlers.get(name)
+ if handler:
+ try:
+ self.handlers.remove(handler)
+ except ValueError:
+ pass
# then add the replacement, if any
if newhandler is not None:
self.add_handler(newhandler)
self._ua_handlers[name] = newhandler
-
-def remove(sequence, obj):
- # for use when can't use .remove() because of obj.__cmp__ :-(
- # (ClientCookie only requires Python 2.0, which doesn't have __lt__)
- i = 0
- while i < len(sequence):
- if sequence[i] is obj:
- del sequence[i]
- else:
- i += 1
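
(Not part of the diff above: a minimal sketch of how the reworked credential
handling might be driven from application code, assuming the UserAgent methods
added here are reachable from mechanize.Browser as in current mechanize
releases; the URLs, proxy host and credentials are made-up values.)

    import mechanize

    browser = mechanize.Browser()
    # replaces the removed set_credentials(): register passwords directly
    # instead of building a urllib2.HTTPPasswordMgr by hand
    browser.add_password("http://www.example.com/protected/", "joe", "secret")
    browser.add_proxy_password("proxyuser", "proxypass",
                               hostport="myproxy.example.com:3128")
    # per-scheme proxies, in the form described by the set_proxies() docstring
    browser.set_proxies({"http": "joe:password@myproxy.example.com:3128",
                         "ftp": "proxy.example.com"})
    response = browser.open("http://www.example.com/protected/")
    print response.read()[:200]
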
Added: Zope3/trunk/src/mechanize/_util.py
===================================================================
--- Zope3/trunk/src/mechanize/_util.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/mechanize/_util.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -0,0 +1,650 @@
+"""Python backwards-compat., date/time routines, seekable file object wrapper.
+
+ Copyright 2002-2006 John J Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD or ZPL 2.1 licenses (see the file
+COPYING.txt included with the distribution).
+
+"""
+
+import re, string, time, copy, urllib, mimetools
+from types import TupleType
+from cStringIO import StringIO
+
+def startswith(string, initial):
+ if len(initial) > len(string): return False
+ return string[:len(initial)] == initial
+
+def endswith(string, final):
+ if len(final) > len(string): return False
+ return string[-len(final):] == final
+
+def isstringlike(x):
+ try: x+""
+ except: return False
+ else: return True
+
+SPACE_DICT = {}
+for c in string.whitespace:
+ SPACE_DICT[c] = None
+del c
+def isspace(string):
+ for c in string:
+ if not SPACE_DICT.has_key(c): return False
+ return True
+
+## def caller():
+## try:
+## raise SyntaxError
+## except:
+## import sys
+## return sys.exc_traceback.tb_frame.f_back.f_back.f_code.co_name
+
+
+# this is here rather than in _HeadersUtil as it's just for
+# compatibility with old Python versions, rather than entirely new code
+def getheaders(msg, name):
+ """Get all values for a header.
+
+ This returns a list of values for headers given more than once; each
+ value in the result list is stripped in the same way as the result of
+ getheader(). If the header is not given, return an empty list.
+ """
+ result = []
+ current = ''
+ have_header = 0
+ for s in msg.getallmatchingheaders(name):
+ if isspace(s[0]):
+ if current:
+ current = "%s\n %s" % (current, string.strip(s))
+ else:
+ current = string.strip(s)
+ else:
+ if have_header:
+ result.append(current)
+ current = string.strip(s[string.find(s, ":") + 1:])
+ have_header = 1
+ if have_header:
+ result.append(current)
+ return result
+
+from calendar import timegm
+
+# Date/time conversion routines for formats used by the HTTP protocol.
+
+EPOCH = 1970
+def my_timegm(tt):
+ year, month, mday, hour, min, sec = tt[:6]
+ if ((year >= EPOCH) and (1 <= month <= 12) and (1 <= mday <= 31) and
+ (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
+ return timegm(tt)
+ else:
+ return None
+
+days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
+months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
+months_lower = []
+for month in months: months_lower.append(string.lower(month))
+
+
+def time2isoz(t=None):
+ """Return a string representing time in seconds since epoch, t.
+
+ If the function is called without an argument, it will use the current
+ time.
+
+ The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
+ representing Universal Time (UTC, aka GMT). An example of this format is:
+
+ 1994-11-24 08:49:37Z
+
+ """
+ if t is None: t = time.time()
+ year, mon, mday, hour, min, sec = time.gmtime(t)[:6]
+ return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
+ year, mon, mday, hour, min, sec)
+
+def time2netscape(t=None):
+ """Return a string representing time in seconds since epoch, t.
+
+ If the function is called without an argument, it will use the current
+ time.
+
+ The format of the returned string is like this:
+
+ Wed, DD-Mon-YYYY HH:MM:SS GMT
+
+ """
+ if t is None: t = time.time()
+ year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7]
+ return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % (
+ days[wday], mday, months[mon-1], year, hour, min, sec)
+
+
+UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}
+
+timezone_re = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
+def offset_from_tz_string(tz):
+ offset = None
+ if UTC_ZONES.has_key(tz):
+ offset = 0
+ else:
+ m = timezone_re.search(tz)
+ if m:
+ offset = 3600 * int(m.group(2))
+ if m.group(3):
+ offset = offset + 60 * int(m.group(3))
+ if m.group(1) == '-':
+ offset = -offset
+ return offset
+
+def _str2time(day, mon, yr, hr, min, sec, tz):
+ # translate month name to number
+ # month numbers start with 1 (January)
+ try:
+ mon = months_lower.index(string.lower(mon))+1
+ except ValueError:
+ # maybe it's already a number
+ try:
+ imon = int(mon)
+ except ValueError:
+ return None
+ if 1 <= imon <= 12:
+ mon = imon
+ else:
+ return None
+
+ # make sure clock elements are defined
+ if hr is None: hr = 0
+ if min is None: min = 0
+ if sec is None: sec = 0
+
+ yr = int(yr)
+ day = int(day)
+ hr = int(hr)
+ min = int(min)
+ sec = int(sec)
+
+ if yr < 1000:
+ # find "obvious" year
+ cur_yr = time.localtime(time.time())[0]
+ m = cur_yr % 100
+ tmp = yr
+ yr = yr + cur_yr - m
+ m = m - tmp
+ if abs(m) > 50:
+ if m > 0: yr = yr + 100
+ else: yr = yr - 100
+
+ # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
+ t = my_timegm((yr, mon, day, hr, min, sec, tz))
+
+ if t is not None:
+ # adjust time using timezone string, to get absolute time since epoch
+ if tz is None:
+ tz = "UTC"
+ tz = string.upper(tz)
+ offset = offset_from_tz_string(tz)
+ if offset is None:
+ return None
+ t = t - offset
+
+ return t
+
+
+strict_re = re.compile(r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) (\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
+wkday_re = re.compile(
+ r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
+loose_http_re = re.compile(
+ r"""^
+ (\d\d?) # day
+ (?:\s+|[-\/])
+ (\w+) # month
+ (?:\s+|[-\/])
+ (\d+) # year
+ (?:
+ (?:\s+|:) # separator before clock
+ (\d\d?):(\d\d) # hour:min
+ (?::(\d\d))? # optional seconds
+ )? # optional clock
+ \s*
+ ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
+ \s*
+ (?:\(\w+\))? # ASCII representation of timezone in parens.
+ \s*$""", re.X)
+def http2time(text):
+ """Returns time in seconds since epoch of time represented by a string.
+
+ Return value is an integer.
+
+ None is returned if the format of str is unrecognized, the time is outside
+ the representable range, or the timezone string is not recognized. If the
+ string contains no timezone, UTC is assumed.
+
+ The timezone in the string may be numerical (like "-0800" or "+0100") or a
+ string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the
+ timezone strings equivalent to UTC (zero offset) are known to the function.
+
+ The function loosely parses the following formats:
+
+ Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format
+ Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format
+ Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format
+ 09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday)
+ 08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday)
+ 08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday)
+
+ The parser ignores leading and trailing whitespace. The time may be
+ absent.
+
+ If the year is given with only 2 digits, the function will select the
+ century that makes the year closest to the current date.
+
+ """
+ # fast exit for strictly conforming string
+ m = strict_re.search(text)
+ if m:
+ g = m.groups()
+ mon = months_lower.index(string.lower(g[1])) + 1
+ tt = (int(g[2]), mon, int(g[0]),
+ int(g[3]), int(g[4]), float(g[5]))
+ return my_timegm(tt)
+
+ # No, we need some messy parsing...
+
+ # clean up
+ text = string.lstrip(text)
+ text = wkday_re.sub("", text, 1) # Useless weekday
+
+ # tz is time zone specifier string
+ day, mon, yr, hr, min, sec, tz = [None]*7
+
+ # loose regexp parse
+ m = loose_http_re.search(text)
+ if m is not None:
+ day, mon, yr, hr, min, sec, tz = m.groups()
+ else:
+ return None # bad format
+
+ return _str2time(day, mon, yr, hr, min, sec, tz)
+
+
+iso_re = re.compile(
+ """^
+ (\d{4}) # year
+ [-\/]?
+ (\d\d?) # numerical month
+ [-\/]?
+ (\d\d?) # day
+ (?:
+ (?:\s+|[-:Tt]) # separator before clock
+ (\d\d?):?(\d\d) # hour:min
+ (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional)
+ )? # optional clock
+ \s*
+ ([-+]?\d\d?:?(:?\d\d)?
+ |Z|z)? # timezone (Z is "zero meridian", i.e. GMT)
+ \s*$""", re.X)
+def iso2time(text):
+ """
+ As for http2time, but parses the ISO 8601 formats:
+
+ 1994-02-03 14:15:29 -0100 -- ISO 8601 format
+ 1994-02-03 14:15:29 -- zone is optional
+ 1994-02-03 -- only date
+ 1994-02-03T14:15:29 -- Use T as separator
+ 19940203T141529Z -- ISO 8601 compact format
+ 19940203 -- only date
+
+ """
+ # clean up
+ text = string.lstrip(text)
+
+ # tz is time zone specifier string
+ day, mon, yr, hr, min, sec, tz = [None]*7
+
+ # loose regexp parse
+ m = iso_re.search(text)
+ if m is not None:
+ # XXX there's an extra bit of the timezone I'm ignoring here: is
+ # this the right thing to do?
+ yr, mon, day, hr, min, sec, tz, _ = m.groups()
+ else:
+ return None # bad format
+
+ return _str2time(day, mon, yr, hr, min, sec, tz)
+
+
+# XXX Andrew Dalke kindly sent me a similar class in response to my request on
+# comp.lang.python, which I then proceeded to lose. I wrote this class
+# instead, but I think he's released his code publicly since, could pinch the
+# tests from it, at least...
+
+# For testing seek_wrapper invariant (note that
+# test_urllib2.HandlerTest.test_seekable is expected to fail when this
+# invariant checking is turned on). The invariant checking is done by module
+# ipdc, which is available here:
+# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/436834
+## from ipdbc import ContractBase
+## class seek_wrapper(ContractBase):
+class seek_wrapper:
+ """Adds a seek method to a file object.
+
+ This is only designed for seeking on readonly file-like objects.
+
+ Wrapped file-like object must have a read method. The readline method is
+ only supported if that method is present on the wrapped object. The
+ readlines method is always supported. xreadlines and iteration are
+ supported only for Python 2.2 and above.
+
+ Public attribute: wrapped (the wrapped file object).
+
+ WARNING: All other attributes of the wrapped object (ie. those that are not
+ one of wrapped, read, readline, readlines, xreadlines, __iter__ and next)
+ are passed through unaltered, which may or may not make sense for your
+ particular file object.
+
+ """
+ # General strategy is to check that cache is full enough, then delegate to
+ # the cache (self.__cache, which is a cStringIO.StringIO instance). A seek
+ # position (self.__pos) is maintained independently of the cache, in order
+ # that a single cache may be shared between multiple seek_wrapper objects.
+ # Copying using module copy shares the cache in this way.
+
+ def __init__(self, wrapped):
+ self.wrapped = wrapped
+ self.__have_readline = hasattr(self.wrapped, "readline")
+ self.__cache = StringIO()
+ self.__pos = 0 # seek position
+
+ def invariant(self):
+ # The end of the cache is always at the same place as the end of the
+ # wrapped file.
+ return self.wrapped.tell() == len(self.__cache.getvalue())
+
+ def __getattr__(self, name):
+ wrapped = self.__dict__.get("wrapped")
+ if wrapped:
+ return getattr(wrapped, name)
+ return getattr(self.__class__, name)
+
+ def seek(self, offset, whence=0):
+ assert whence in [0,1,2]
+
+ # how much data, if any, do we need to read?
+ if whence == 2: # 2: relative to end of *wrapped* file
+ if offset < 0: raise ValueError("negative seek offset")
+ # since we don't know yet where the end of that file is, we must
+ # read everything
+ to_read = None
+ else:
+ if whence == 0: # 0: absolute
+ if offset < 0: raise ValueError("negative seek offset")
+ dest = offset
+ else: # 1: relative to current position
+ pos = self.__pos
+ if pos < offset:
+ raise ValueError("seek to before start of file")
+ dest = pos + offset
+ end = len(self.__cache.getvalue())
+ to_read = dest - end
+ if to_read < 0:
+ to_read = 0
+
+ if to_read != 0:
+ self.__cache.seek(0, 2)
+ if to_read is None:
+ assert whence == 2
+ self.__cache.write(self.wrapped.read())
+ self.__pos = self.__cache.tell() - offset
+ else:
+ self.__cache.write(self.wrapped.read(to_read))
+ # Don't raise an exception even if we've seek()ed past the end
+ # of .wrapped, since fseek() doesn't complain in that case.
+ # Also like fseek(), pretend we have seek()ed past the end,
+ # i.e. not:
+ #self.__pos = self.__cache.tell()
+ # but rather:
+ self.__pos = dest
+ else:
+ self.__pos = dest
+
+ def tell(self):
+ return self.__pos
+
+ def __copy__(self):
+ cpy = self.__class__(self.wrapped)
+ cpy.__cache = self.__cache
+ return cpy
+
+ def get_data(self):
+ pos = self.__pos
+ try:
+ self.seek(0)
+ return self.read(-1)
+ finally:
+ self.__pos = pos
+
+ def read(self, size=-1):
+ pos = self.__pos
+ end = len(self.__cache.getvalue())
+ available = end - pos
+
+ # enough data already cached?
+ if size <= available and size != -1:
+ self.__cache.seek(pos)
+ self.__pos = pos+size
+ return self.__cache.read(size)
+
+ # no, so read sufficient data from wrapped file and cache it
+ self.__cache.seek(0, 2)
+ if size == -1:
+ self.__cache.write(self.wrapped.read())
+ else:
+ to_read = size - available
+ assert to_read > 0
+ self.__cache.write(self.wrapped.read(to_read))
+ self.__cache.seek(pos)
+
+ data = self.__cache.read(size)
+ self.__pos = self.__cache.tell()
+ assert self.__pos == pos + len(data)
+ return data
+
+ def readline(self, size=-1):
+ if not self.__have_readline:
+ raise NotImplementedError("no readline method on wrapped object")
+
+ # line we're about to read might not be complete in the cache, so
+ # read another line first
+ pos = self.__pos
+ self.__cache.seek(0, 2)
+ self.__cache.write(self.wrapped.readline())
+ self.__cache.seek(pos)
+
+ data = self.__cache.readline()
+ if size != -1:
+ r = data[:size]
+ self.__pos = pos+size
+ else:
+ r = data
+ self.__pos = pos+len(data)
+ return r
+
+ def readlines(self, sizehint=-1):
+ pos = self.__pos
+ self.__cache.seek(0, 2)
+ self.__cache.write(self.wrapped.read())
+ self.__cache.seek(pos)
+ data = self.__cache.readlines(sizehint)
+ self.__pos = self.__cache.tell()
+ return data
+
+ def __iter__(self): return self
+ def next(self):
+ line = self.readline()
+ if line == "": raise StopIteration
+ return line
+
+ xreadlines = __iter__
+
+ def __repr__(self):
+ return ("<%s at %s whose wrapped object = %r>" %
+ (self.__class__.__name__, hex(id(self)), self.wrapped))
+
+
+class response_seek_wrapper(seek_wrapper):
+
+ """
+ Supports copying response objects and setting response body data.
+
+ """
+
+ def __init__(self, wrapped):
+ seek_wrapper.__init__(self, wrapped)
+ self._headers = self.wrapped.info()
+
+ def __copy__(self):
+ cpy = seek_wrapper.__copy__(self)
+ # copy headers from delegate
+ cpy._headers = copy.copy(self.info())
+ return cpy
+
+ def info(self):
+ return self._headers
+
+ def set_data(self, data):
+ self.seek(0)
+ self.read()
+ self.close()
+ cache = self._seek_wrapper__cache = StringIO()
+ cache.write(data)
+ self.seek(0)
+
+
+class eoffile:
+ # file-like object that always claims to be at end-of-file...
+ def read(self, size=-1): return ""
+ def readline(self, size=-1): return ""
+ def __iter__(self): return self
+ def next(self): return ""
+ def close(self): pass
+
+class eofresponse(eoffile):
+ def __init__(self, url, headers, code, msg):
+ self._url = url
+ self._headers = headers
+ self.code = code
+ self.msg = msg
+ def geturl(self): return self._url
+ def info(self): return self._headers
+
+
+class closeable_response:
+ """Avoids unnecessarily clobbering urllib.addinfourl methods on .close().
+
+ Only supports responses returned by mechanize.HTTPHandler.
+
+ After .close(), the following methods are supported:
+
+ .read()
+ .readline()
+ .readlines()
+ .seek()
+ .tell()
+ .info()
+ .geturl()
+ .__iter__()
+ .next()
+ .close()
+
+ and the following attributes are supported:
+
+ .code
+ .msg
+
+ Also supports pickling (but the stdlib currently does something to prevent
+ it: http://python.org/sf/1144636).
+
+ """
+ # presence of this attr indicates is useable after .close()
+ closeable_response = None
+
+ def __init__(self, fp, headers, url, code, msg):
+ self._set_fp(fp)
+ self._headers = headers
+ self._url = url
+ self.code = code
+ self.msg = msg
+
+ def _set_fp(self, fp):
+ self.fp = fp
+ self.read = self.fp.read
+ self.readline = self.fp.readline
+ if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
+ if hasattr(self.fp, "fileno"):
+ self.fileno = self.fp.fileno
+ else:
+ self.fileno = lambda: None
+ if hasattr(self.fp, "__iter__"):
+ self.__iter__ = self.fp.__iter__
+ if hasattr(self.fp, "next"):
+ self.next = self.fp.next
+
+ def __repr__(self):
+ return '<%s at %s whose fp = %r>' % (
+ self.__class__.__name__, hex(id(self)), self.fp)
+
+ def info(self):
+ return self._headers
+
+ def geturl(self):
+ return self._url
+
+ def close(self):
+ wrapped = self.fp
+ wrapped.close()
+ new_wrapped = eofresponse(
+ self._url, self._headers, self.code, self.msg)
+ self._set_fp(new_wrapped)
+
+ def __getstate__(self):
+ # There are three obvious options here:
+ # 1. truncate
+ # 2. read to end
+ # 3. close socket, pickle state including read position, then open
+ # again on unpickle and use Range header
+ # XXXX um, 4. refuse to pickle unless .close()d. This is better,
+ # actually ("errors should never pass silently"). Pickling doesn't
+ # work anyway ATM, because of http://python.org/sf/1144636 so fix
+ # this later
+
+ # 2 breaks pickle protocol, because one expects the original object
+ # to be left unscathed by pickling. 3 is too complicated and
+ # surprising (and too much work ;-) to happen in a sane __getstate__.
+ # So we do 1.
+
+ state = self.__dict__.copy()
+ new_wrapped = eofresponse(
+ self._url, self._headers, self.code, self.msg)
+ state["wrapped"] = new_wrapped
+ return state
+
+def make_response(data, headers, url, code, msg):
+ """Convenient factory for objects implementing response interface.
+
+ data: string containing response body data
+ headers: sequence of (name, value) pairs
+ url: URL of response
+ code: integer response code (e.g. 200)
+ msg: string response code message (e.g. "OK")
+
+ """
+ hdr_text = []
+ for name_value in headers:
+ hdr_text.append("%s: %s" % name_value)
+ mime_headers = mimetools.Message(StringIO("\n".join(hdr_text)))
+ r = closeable_response(StringIO(data), mime_headers, url, code, msg)
+ return response_seek_wrapper(r)
Property changes on: Zope3/trunk/src/mechanize/_util.py
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
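
(Not part of the diff: a small usage sketch for the helpers added above,
assuming they are importable as mechanize._util; the expected values in the
comments restate the docstrings rather than verified output.)

    from mechanize._util import http2time, iso2time, time2isoz, make_response

    t = http2time("Wed, 09 Feb 1994 22:23:32 GMT")  # seconds since epoch
    print time2isoz(t)                     # "1994-02-09 22:23:32Z"
    print iso2time("1994-02-03T14:15:29")  # same kind of value, ISO 8601 input

    # build a seekable, closeable response object from raw pieces
    response = make_response("<html>hello</html>",
                             [("Content-type", "text/html")],
                             "http://example.com/", 200, "OK")
    print response.geturl(), response.code, response.msg
    print response.read()
    response.seek(0)        # response_seek_wrapper layers .seek() on top
    print response.read(6)  # "<html>"
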
Deleted: Zope3/trunk/src/pullparser.py
===================================================================
--- Zope3/trunk/src/pullparser.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/pullparser.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -1,350 +0,0 @@
-"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
-
-Examples
-
-This program extracts all links from a document. It will print one
-line for each link, containing the URL and the textual description
-between the <A>...</A> tags:
-
-import pullparser, sys
-f = file(sys.argv[1])
-p = pullparser.PullParser(f)
-for token in p.tags("a"):
- if token.type == "endtag": continue
- url = dict(token.attrs).get("href", "-")
- text = p.get_compressed_text(endat=("endtag", "a"))
- print "%s\t%s" % (url, text)
-
-This program extracts the <TITLE> from the document:
-
-import pullparser, sys
-f = file(sys.argv[1])
-p = pullparser.PullParser(f)
-if p.get_tag("title"):
- title = p.get_compressed_text()
- print "Title: %s" % title
-
-
-Copyright 2003-2004 John J. Lee <jjl at pobox.com>
-Copyright 1998-2001 Gisle Aas (original libwww-perl code)
-
-This code is free software; you can redistribute it and/or modify it
-under the terms of the BSD License.
-
-"""
-
-from __future__ import generators
-
-import re, htmlentitydefs
-import HTMLParser
-
-__version__ = (0, 0, 6, None, None) # 0.0.6b
-
-class NoMoreTokensError(Exception): pass
-
-class Token:
- """Represents an HTML tag, declaration, processing instruction etc.
-
- Behaves as both a tuple-like object (ie. iterable) and has attributes
- .type, .data and .attrs.
-
- >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
- >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
- True
- >>> t.type, t.data == "starttag", "a"
- True
- >>> t.attrs == [("href", "http://www.python.org/")]
- True
-
- Public attributes
-
- type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
- "data", "comment", "decl", "pi", after the corresponding methods of
- HTMLParser.HTMLParser
- data: For a tag, the tag name; otherwise, the relevant data carried by the
- tag, as a string
- attrs: list of (name, value) pairs representing HTML attributes
- (or None if token does not represent an opening tag)
-
- """
- def __init__(self, type, data, attrs=None):
- self.type = type
- self.data = data
- self.attrs = attrs
- def __iter__(self):
- return iter((self.type, self.data, self.attrs))
- def __eq__(self, other):
- type, data, attrs = other
- if (self.type == type and
- self.data == data and
- self.attrs == attrs):
- return True
- else:
- return False
- def __ne__(self, other): return not self.__eq__(other)
- def __repr__(self):
- args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
- return self.__class__.__name__+"(%s)" % args
-
-def iter_until_exception(fn, exception, *args, **kwds):
- while 1:
- try:
- yield fn(*args, **kwds)
- except exception:
- raise StopIteration
-
-def caller():
- try:
- raise SyntaxError
- except:
- import sys
- return sys.exc_traceback.tb_frame.f_back.f_back.f_code.co_name
-
-def unescape(data, entities):
- if data is None or '&' not in data:
- return data
- def replace_entities(match):
- ent = match.group()
- repl = entities.get(ent, ent)
- return repl
- return re.sub(r'&\S+?;', replace_entities, data)
-
-def get_entitydefs():
- entitydefs = {}
- for name, char in htmlentitydefs.entitydefs.items():
- entitydefs["&%s;" % name] = char
- return entitydefs
-
-
-class _AbstractParser:
- chunk = 1024
- compress_re = re.compile(r"\s+")
- entitydefs = htmlentitydefs.entitydefs
- def __init__(self, fh, textify={"img": "alt", "applet": "alt"},
- encoding="ascii", entitydefs=None):
- """
- fh: file-like object (only a .read() method is required) from which to
- read HTML to be parsed
- textify: mapping used by .get_text() and .get_compressed_text() methods
- to represent opening tags as text
- encoding: encoding used to encode numeric character references by
- .get_text() and .get_compressed_text() ("ascii" by default)
- entitydefs: mapping like {'&': '&', ...} containing HTML entity
- definitions (a sensible default is used)
-
- If the element name of an opening tag matches a key in the textify
- mapping then that tag is converted to text. The corresponding value is
- used to specify which tag attribute to obtain the text from. textify
- maps from element names to either:
-
- - an HTML attribute name, in which case the HTML attribute value is
- used as its text value along with the element name in square
- brackets (eg."alt text goes here[IMG]", or, if the alt attribute
- were missing, just "[IMG]")
- - a callable object (eg. a function) which takes a Token and returns
- the string to be used as its text value
-
- If textify has no key for an element name, nothing is substituted for
- the opening tag.
-
- Public attributes:
-
- encoding and textify: see above
-
- """
- self._fh = fh
- self._tokenstack = [] # FIFO
- self.textify = textify
- self.encoding = encoding
- if entitydefs is None:
- entitydefs = get_entitydefs()
- self._entitydefs = entitydefs
-
- def __iter__(self): return self
-
- def tags(self, *names):
- return iter_until_exception(self.get_tag, NoMoreTokensError, *names)
-
- def tokens(self, *tokentypes):
- return iter_until_exception(self.get_token, NoMoreTokensError, *tokentypes)
-
- def next(self):
- try:
- return self.get_token()
- except NoMoreTokensError:
- raise StopIteration()
-
- def get_token(self, *tokentypes):
- """Pop the next Token object from the stack of parsed tokens.
-
- If arguments are given, they are taken to be token types in which the
- caller is interested: tokens representing other elements will be
- skipped. Element names must be given in lower case.
-
- Raises NoMoreTokensError.
-
- """
- while 1:
- while self._tokenstack:
- token = self._tokenstack.pop(0)
- if tokentypes:
- if token.type in tokentypes:
- return token
- else:
- return token
- data = self._fh.read(self.chunk)
- if not data:
- raise NoMoreTokensError()
- self.feed(data)
-
- def unget_token(self, token):
- """Push a Token back onto the stack."""
- self._tokenstack.insert(0, token)
-
- def get_tag(self, *names):
- """Return the next Token that represents an opening or closing tag.
-
- If arguments are given, they are taken to be element names in which the
- caller is interested: tags representing other elements will be skipped.
- Element names must be given in lower case.
-
- Raises NoMoreTokensError.
-
- """
- while 1:
- tok = self.get_token()
- if tok.type not in ["starttag", "endtag", "startendtag"]:
- continue
- if names:
- if tok.data in names:
- return tok
- else:
- return tok
-
- def get_text(self, endat=None):
- """Get some text.
-
- endat: stop reading text at this tag (the tag is included in the
- returned text); endtag is a tuple (type, name) where type is
- "starttag", "endtag" or "startendtag", and name is the element name of
- the tag (element names must be given in lower case)
-
- If endat is not given, .get_text() will stop at the next opening or
- closing tag, or when there are no more tokens (no exception is raised).
- Note that .get_text() includes the text representation (if any) of the
- opening tag, but pushes the opening tag back onto the stack. As a
- result, if you want to call .get_text() again, you need to call
- .get_tag() first (unless you want an empty string returned when you
- next call .get_text()).
-
- Entity references are translated using the entitydefs attribute (a
- mapping from names to characters like that provided by the standard
- module htmlentitydefs). Named entity references that are not in this
- mapping are left unchanged.
-
- The textify attribute is used to translate opening tags into text: see
- the class docstring.
-
- """
- text = []
- tok = None
- while 1:
- try:
- tok = self.get_token()
- except NoMoreTokensError:
- # unget last token (not the one we just failed to get)
- if tok: self.unget_token(tok)
- break
- if tok.type == "data":
- text.append(tok.data)
- elif tok.type == "entityref":
- name = tok.data
- if name in self.entitydefs:
- t = self.entitydefs[name]
- else:
- t = "&%s;" % name
- text.append(t)
- elif tok.type == "charref":
- name, base = tok.data, 10
- if name.startswith('x'):
- name, base= name[1:], 16
- t = unichr(int(name, base)).encode(self.encoding)
- text.append(t)
- elif tok.type in ["starttag", "endtag", "startendtag"]:
- tag_name = tok.data
- if tok.type in ["starttag", "startendtag"]:
- alt = self.textify.get(tag_name)
- if alt is not None:
- if callable(alt):
- text.append(alt(tok))
- elif tok.attrs is not None:
- for k, v in tok.attrs:
- if k == alt:
- text.append(v)
- text.append("[%s]" % tag_name.upper())
- if endat is None or endat == (tok.type, tag_name):
- self.unget_token(tok)
- break
- return "".join(text)
-
- def get_compressed_text(self, *args, **kwds):
- """
- As .get_text(), but collapses each group of contiguous whitespace to a
- single space character, and removes all initial and trailing
- whitespace.
-
- """
- text = self.get_text(*args, **kwds)
- text = text.strip()
- return self.compress_re.sub(" ", text)
-
- def handle_startendtag(self, tag, attrs):
- self._tokenstack.append(Token("startendtag", tag, attrs))
- def handle_starttag(self, tag, attrs):
- self._tokenstack.append(Token("starttag", tag, attrs))
- def handle_endtag(self, tag):
- self._tokenstack.append(Token("endtag", tag))
- def handle_charref(self, name):
- self._tokenstack.append(Token("charref", name))
- def handle_entityref(self, name):
- self._tokenstack.append(Token("entityref", name))
- def handle_data(self, data):
- self._tokenstack.append(Token("data", data))
- def handle_comment(self, data):
- self._tokenstack.append(Token("comment", data))
- def handle_decl(self, decl):
- self._tokenstack.append(Token("decl", decl))
- def unknown_decl(self, data):
- # XXX should this call self.error instead?
- #self.error("unknown declaration: " + `data`)
- self._tokenstack.append(Token("decl", data))
- def handle_pi(self, data):
- self._tokenstack.append(Token("pi", data))
-
- def unescape_attr(self, name):
- return unescape(name, self._entitydefs)
- def unescape_attrs(self, attrs):
- escaped_attrs = []
- for key, val in attrs:
- escaped_attrs.append((key, self.unescape_attr(val)))
- return escaped_attrs
-
-class PullParser(_AbstractParser, HTMLParser.HTMLParser):
- def __init__(self, *args, **kwds):
- HTMLParser.HTMLParser.__init__(self)
- _AbstractParser.__init__(self, *args, **kwds)
- def unescape(self, name):
- # Use the entitydefs passed into constructor, not
- # HTMLParser.HTMLParser's entitydefs.
- return self.unescape_attr(name)
-
-import sgmllib
-class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
- def __init__(self, *args, **kwds):
- sgmllib.SGMLParser.__init__(self)
- _AbstractParser.__init__(self, *args, **kwds)
- def unknown_starttag(self, tag, attrs):
- attrs = self.unescape_attrs(attrs)
- self._tokenstack.append(Token("starttag", tag, attrs))
- def unknown_endtag(self, tag):
- self._tokenstack.append(Token("endtag", tag))
Modified: Zope3/trunk/src/zope/app/apidoc/browser/README.txt
===================================================================
--- Zope3/trunk/src/zope/app/apidoc/browser/README.txt 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/zope/app/apidoc/browser/README.txt 2006-06-21 21:54:07 UTC (rev 68784)
@@ -26,7 +26,7 @@
... except HTTPError, error:
... pass
- >>> print error.read()
+ >>> print browser.contents
<...
<h1 class="details-header">
Page Not Found
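
(Not part of the diff: the hunk above only shows the changed line, so here is
the whole pattern in one place, with a made-up URL. The point of the change is
that the error body is now read from browser.contents instead of the
undocumented error.read() method.)

    from urllib2 import HTTPError
    from zope.testbrowser.testing import Browser

    browser = Browser()
    try:
        browser.open('http://localhost/++apidoc++/nonexistent.html')
    except HTTPError, error:
        pass
    print browser.contents   # the "Page Not Found" body returned by the server
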
Modified: Zope3/trunk/src/zope/testbrowser/DEPENDENCIES.cfg
===================================================================
--- Zope3/trunk/src/zope/testbrowser/DEPENDENCIES.cfg 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/zope/testbrowser/DEPENDENCIES.cfg 2006-06-21 21:54:07 UTC (rev 68784)
@@ -1,7 +1,5 @@
-ClientCookie
ClientForm
mechanize
-pullparser
zope.interface
# zope.app # This is a soft-dependence
Modified: Zope3/trunk/src/zope/testbrowser/README.txt
===================================================================
--- Zope3/trunk/src/zope/testbrowser/README.txt 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/zope/testbrowser/README.txt 2006-06-21 21:54:07 UTC (rev 68784)
@@ -1128,7 +1128,7 @@
Hand-Holding
------------
-Instances of the various objects ensure that users don't accidentally set
+Instances of the various objects ensure that users don't set incorrect
instance attributes accidentally.
>>> browser.nonexistant = None
@@ -1158,8 +1158,25 @@
This section includes tests for bugs that were found and then fixed that don't
fit into the more documentation-centric sections above.
+Spaces in URL
+~~~~~~~~~~~~~
+
When URLs have spaces in them, they're handled correctly (before the bug was
fixed, you'd get "ValueError: too many values to unpack"):
>>> browser.open('http://localhost/@@/testbrowser/navigate.html')
>>> browser.getLink('Spaces in the URL').click()
+
+.goBack() Truncation
+~~~~~~~~~~~~~~~~~~~~
+
+The .goBack() method used to truncate the .contents.
+
+ >>> browser.open('http://localhost/@@/testbrowser/navigate.html')
+ >>> actual_length = len(browser.contents)
+
+ >>> browser.open('http://localhost/@@/testbrowser/navigate.html')
+ >>> browser.open('http://localhost/@@/testbrowser/simple.html')
+ >>> browser.goBack()
+ >>> len(browser.contents) == actual_length
+ True
Modified: Zope3/trunk/src/zope/testbrowser/browser.py
===================================================================
--- Zope3/trunk/src/zope/testbrowser/browser.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/zope/testbrowser/browser.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -22,7 +22,6 @@
from cStringIO import StringIO
import mechanize
import operator
-import pullparser
import re
import sys
import time
@@ -217,16 +216,24 @@
"""See zope.testbrowser.interfaces.IBrowser"""
self._start_timer()
try:
- self.mech_browser.open(url, data)
- except urllib2.HTTPError, e:
- if e.code >= 200 and e.code <= 299:
- # 200s aren't really errors
- pass
- else:
- raise
+ try:
+ self.mech_browser.open(url, data)
+ except urllib2.HTTPError, e:
+ if e.code >= 200 and e.code <= 299:
+ # 200s aren't really errors
+ pass
+ else:
+ raise
+ finally:
+ self._stop_timer()
+ self._changed()
- self._stop_timer()
- self._changed()
+ # if the headers don't have a status, I suppose there can't be an error
+ if 'Status' in self.headers:
+ code, msg = self.headers['Status'].split(' ', 1)
+ code = int(code)
+ if code >= 400:
+ raise urllib2.HTTPError(url, code, msg, self.headers, fp=None)
def _start_timer(self):
self.timer.start()
@@ -294,13 +301,20 @@
for control in f.controls:
phantom = control.type in ('radio', 'checkbox')
if include_subcontrols and (
+ phantom or control.type=='select'):
- phantom or control.type=='select'):
+ found_one = False
for i in control.items:
for l in i.get_labels():
if matches(l.text):
found.append((i, f))
+ found_one = True
break
+
+ if found_one:
+ del found_one
+ continue
+
if not phantom:
for l in control.get_labels():
if matches(l.text):
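
(Not part of the diff: a toy rendering of the new Status-header check in
Browser.open() above, with made-up header values, to show how a 4xx/5xx status
reported by the publisher is turned back into an HTTPError.)

    import urllib2

    url = 'http://localhost/missing'
    headers = {'Status': '404 Not Found'}
    try:
        if 'Status' in headers:
            code, msg = headers['Status'].split(' ', 1)
            code = int(code)
            if code >= 400:
                raise urllib2.HTTPError(url, code, msg, headers, fp=None)
    except urllib2.HTTPError, e:
        print e.code, e.msg   # 404 Not Found
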
Modified: Zope3/trunk/src/zope/testbrowser/testing.py
===================================================================
--- Zope3/trunk/src/zope/testbrowser/testing.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/zope/testbrowser/testing.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -22,7 +22,6 @@
from cStringIO import StringIO
import mechanize
-import ClientCookie
import transaction
from zope.testbrowser import browser
@@ -127,28 +126,23 @@
class PublisherMechanizeBrowser(mechanize.Browser):
"""Special ``mechanize`` browser using the Zope Publisher HTTP handler."""
- handler_classes = {
- # scheme handlers
- "http": PublisherHTTPHandler,
+ default_schemes = ['http']
+ default_others = ['_http_error', '_http_request_upgrade',
+ '_http_default_error']
+ default_features = ['_redirect', '_cookies', '_referer', '_refresh',
+ '_equiv', '_basicauth', '_digestauth', '_seek' ]
- "_http_error": ClientCookie.HTTPErrorProcessor,
- "_http_request_upgrade": ClientCookie.HTTPRequestUpgradeProcessor,
- "_http_default_error": urllib2.HTTPDefaultErrorHandler,
+ def __init__(self, *args, **kws):
+ inherited_handlers = ['_unknown', '_http_error',
+ '_http_request_upgrade', '_http_default_error', '_basicauth',
+ '_digestauth', '_redirect', '_cookies', '_referer',
+ '_refresh', '_equiv', '_seek', '_gzip']
- # feature handlers
- "_authen": urllib2.HTTPBasicAuthHandler,
- "_redirect": ClientCookie.HTTPRedirectHandler,
- "_cookies": ClientCookie.HTTPCookieProcessor,
- "_refresh": ClientCookie.HTTPRefreshProcessor,
- "_referer": mechanize.Browser.handler_classes['_referer'],
- "_equiv": ClientCookie.HTTPEquivProcessor,
- "_seek": ClientCookie.SeekableProcessor,
- }
+ self.handler_classes = {"http": PublisherHTTPHandler}
+ for name in inherited_handlers:
+ self.handler_classes[name] = mechanize.Browser.handler_classes[name]
- default_schemes = ["http"]
- default_others = ["_http_error", "_http_request_upgrade",
- "_http_default_error"]
- default_features = ["_authen", "_redirect", "_cookies", "_seek"]
+ mechanize.Browser.__init__(self, *args, **kws)
class Browser(browser.Browser):
Modified: Zope3/trunk/src/zope/testbrowser/tests.py
===================================================================
--- Zope3/trunk/src/zope/testbrowser/tests.py 2006-06-21 17:59:57 UTC (rev 68783)
+++ Zope3/trunk/src/zope/testbrowser/tests.py 2006-06-21 21:54:07 UTC (rev 68784)
@@ -23,7 +23,6 @@
from cStringIO import StringIO
import mechanize
-import ClientCookie
from zope.testbrowser import browser
from zope.testing import renormalizing, doctest
@@ -128,18 +127,18 @@
# scheme handlers
"http": FauxHTTPHandler,
- "_http_error": ClientCookie.HTTPErrorProcessor,
- "_http_request_upgrade": ClientCookie.HTTPRequestUpgradeProcessor,
+ "_http_error": mechanize.HTTPErrorProcessor,
+ "_http_request_upgrade": mechanize.HTTPRequestUpgradeProcessor,
"_http_default_error": urllib2.HTTPDefaultErrorHandler,
# feature handlers
"_authen": urllib2.HTTPBasicAuthHandler,
- "_redirect": ClientCookie.HTTPRedirectHandler,
- "_cookies": ClientCookie.HTTPCookieProcessor,
- "_refresh": ClientCookie.HTTPRefreshProcessor,
+ "_redirect": mechanize.HTTPRedirectHandler,
+ "_cookies": mechanize.HTTPCookieProcessor,
+ "_refresh": mechanize.HTTPRefreshProcessor,
"_referer": mechanize.Browser.handler_classes['_referer'],
- "_equiv": ClientCookie.HTTPEquivProcessor,
- "_seek": ClientCookie.SeekableProcessor,
+ "_equiv": mechanize.HTTPEquivProcessor,
+ "_seek": mechanize.SeekableProcessor,
}
default_schemes = ["http"]
@@ -222,6 +221,8 @@
checker = renormalizing.RENormalizing([
(re.compile(r'^--\S+\.\S+\.\S+', re.M), '-'*30),
(re.compile(r'boundary=\S+\.\S+\.\S+'), 'boundary='+'-'*30),
+ (re.compile(r'^---{10}.*', re.M), '-'*30),
+ (re.compile(r'boundary=-{10}.*'), 'boundary='+'-'*30),
(re.compile('User-agent:\s+\S+'), 'User-agent: XXX'),
(re.compile('Content-length:\s+\S+'), 'Content-length: 123'),
])
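
(Not part of the diff: a sketch of how a checker like the one above is
typically attached to a doctest suite with zope.testing; the file name and
option flags are assumptions, not taken from this checkin.)

    import re
    from zope.testing import renormalizing, doctest

    checker = renormalizing.RENormalizing([
        (re.compile(r'boundary=\S+\.\S+\.\S+'), 'boundary=' + '-' * 30),
        (re.compile(r'User-agent:\s+\S+'), 'User-agent: XXX'),
        ])

    def test_suite():
        # normalize volatile output (MIME boundaries, user agent strings)
        # before doctest comparison
        return doctest.DocFileSuite('README.txt', checker=checker,
                                    optionflags=doctest.ELLIPSIS)
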