[Checkins] SVN: zc.mirrorcheeseshopslashsimple/branches/local-mirror/src/zc/mirrorcheeseshopslashsimple.py savepoint

Andreas Jung andreas at andreas-jung.com
Tue Aug 26 05:45:45 EDT 2008


Log message for revision 90279:
  savepoint
  

Changed:
  U   zc.mirrorcheeseshopslashsimple/branches/local-mirror/src/zc/mirrorcheeseshopslashsimple.py

-=-
Modified: zc.mirrorcheeseshopslashsimple/branches/local-mirror/src/zc/mirrorcheeseshopslashsimple.py
===================================================================
--- zc.mirrorcheeseshopslashsimple/branches/local-mirror/src/zc/mirrorcheeseshopslashsimple.py	2008-08-26 09:41:56 UTC (rev 90278)
+++ zc.mirrorcheeseshopslashsimple/branches/local-mirror/src/zc/mirrorcheeseshopslashsimple.py	2008-08-26 09:45:44 UTC (rev 90279)
@@ -16,6 +16,9 @@
 import os, sys, time, urllib, urllib2, xmlrpclib
 import zc.lockfile
 import util
+from BeautifulSoup import BeautifulSoup
+from glob import fnmatch
+from md5 import md5
 
 lock_file_path = 'pypi-poll-access.lock'
 poll_time_path = 'pypi-poll-timestamp'
@@ -43,6 +46,11 @@
 
 def get_page(dest, package, force=False):
 
+    package_matches = ["zope.app.*",]
+    if not True in [fnmatch.fnmatch(package, package_match) for package_match in package_matches]:
+        return
+
+
     if not util.isASCII(package):
         print 'skipping %r which has a non-ascii name' % `package`
         return
@@ -72,7 +80,72 @@
     write(page, pdest, 'index.html')
     mirror_package(package, page, dest)
 
+def fetch_package(url):
+    try:
+        package_file_data = urllib2.urlopen(url).read()
+    except urllib2.HTTPError, v:
+        if '404' in str(v):             # sigh
+            raise "404: " % url
+    except urllib2.URLError, v:
+        # this happens on that url for example:
+        # http://pypi.python.org/packages/source/a/appwsgi/appwsgi 667.tar.bz2
+        # don't care, just continue.
+        # XXX TODO: urlencode the path so that spaces work.
+        raise "Invalid url: %s" % url
+    return package_file_data
+
+
 def mirror_package(package, page, dest):
+    # XXX TODO: Check if the provided list of links is the same as
+    # the list on the FS and delete local copies in case they're missing
+    # online. Make this configurable.
+
+    html = BeautifulSoup(page)
+    links = [link["href"] for link in html.findAll("a")]
+    # interesting links look like this:
+    # http://pypi.python.org/packages/2.4/4/4Suite-XML/4Suite_XML-1.0.2-py2.4-win32.egg#md5=b561e3750ba422ade50f81f2f70b55e2
+    # Let's split the filename and the md5 hash.
+    for link in links:
+        (url, hash) = urllib.splittag(link)
+        package_dest_path = "%s/%s/%s" % (dest, package, os.path.basename(url))
+
+        if not hash:
+            continue
+        try:
+            (hashname, hash) = hash.split("=")
+        except ValueError:
+            continue
+        if not hashname == "md5":
+            continue
+
+        # XXX TODO: Put this in the config file
+        allowed_matches = ["*.egg", "*.tar.gz", "*.tar.bz2",]
+        if not True in [fnmatch.fnmatch(url, allowed_match) for allowed_match in allowed_matches]:
+            continue
+
+        # alright, fetch the url if the md5 doesn't match an existing package.
+        if os.path.exists(package_dest_path):
+            current_md5_hex = md5(open(package_dest_path, "rb").read()).hexdigest()
+            if current_md5_hex == hash:
+                print "Skipping %s, already there." % package_dest_path
+                continue
+
+        try:
+            package_file_data = fetch_package(url)
+        except:
+            continue
+
+        if not package_file_data:
+            continue
+
+        md5_hex = md5(package_file_data).hexdigest()
+        if not hash == md5_hex:
+            print 'Skipping', `package`, "which doesn't match the provided md5 checksum."
+
+        # save package
+        print "Storing package %s [%s bytes]" % (package_dest_path, len(package_file_data))
+        open(package_dest_path, "wb").write(package_file_data)
+
     print package, dest
 
 def save_time(dest, timestamp):



More information about the Checkins mailing list