[Zodb-checkins] SVN: ZODB/branches/3.8/ Merged the ctheune-bushy-directory-3.8 branch which makes blobs work

Jim Fulton jim at zope.com
Wed Aug 27 06:05:59 EDT 2008


Log message for revision 90419:
  Merged the ctheune-bushy-directory-3.8 branch which makes blobs work
  for large databases on systems that don't allow many subdirectories.
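
The idea behind the bushy layout is to spread OIDs over an eight-level
directory tree, one level per byte of the packed OID, so that there are at
most 256 entries per directory level. A minimal standalone sketch of the
mapping (the merged implementation is BushyLayout in ZODB.blob, below):

    import binascii

    def oid_to_bushy_path(oid):
        # `oid` is the 8-byte packed string used throughout ZODB (utils.p64 output)
        return '/'.join('0x%s' % binascii.hexlify(byte) for byte in oid)

    # oid_to_bushy_path('\x00\x00\x00\x00\x00\x00\x1b\x7f')
    # -> '0x00/0x00/0x00/0x00/0x00/0x00/0x1b/0x7f'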
  

Changed:
  U   ZODB/branches/3.8/NEWS.txt
  U   ZODB/branches/3.8/setup.py
  U   ZODB/branches/3.8/src/ZEO/ClientStorage.py
  U   ZODB/branches/3.8/src/ZEO/tests/testZEO.py
  U   ZODB/branches/3.8/src/ZODB/Connection.py
  U   ZODB/branches/3.8/src/ZODB/blob.py
  A   ZODB/branches/3.8/src/ZODB/scripts/migrateblobs.py
  A   ZODB/branches/3.8/src/ZODB/tests/blob_layout.txt
  U   ZODB/branches/3.8/src/ZODB/tests/blob_tempdir.txt
  U   ZODB/branches/3.8/src/ZODB/tests/blob_transaction.txt
  U   ZODB/branches/3.8/src/ZODB/tests/testblob.py

-=-
Modified: ZODB/branches/3.8/NEWS.txt
===================================================================
--- ZODB/branches/3.8/NEWS.txt	2008-08-27 09:59:44 UTC (rev 90418)
+++ ZODB/branches/3.8/NEWS.txt	2008-08-27 10:05:58 UTC (rev 90419)
@@ -1,5 +1,4 @@
 
-
 Whats new in ZODB 3.8.1
 =======================
 

Modified: ZODB/branches/3.8/setup.py
===================================================================
--- ZODB/branches/3.8/setup.py	2008-08-27 09:59:44 UTC (rev 90418)
+++ ZODB/branches/3.8/setup.py	2008-08-27 10:05:58 UTC (rev 90419)
@@ -20,7 +20,7 @@
 interface, rich transaction support, and undo.
 """
 
-VERSION = "3.8.1dev"
+VERSION = "3.8.1dev.bushy"
 
 # The (non-obvious!) choices for the Trove Development Status line:
 # Development Status :: 5 - Production/Stable

Modified: ZODB/branches/3.8/src/ZEO/ClientStorage.py
===================================================================
--- ZODB/branches/3.8/src/ZEO/ClientStorage.py	2008-08-27 09:59:44 UTC (rev 90418)
+++ ZODB/branches/3.8/src/ZEO/ClientStorage.py	2008-08-27 10:05:58 UTC (rev 90419)
@@ -865,9 +865,7 @@
 
     def _storeBlob_shared(self, oid, serial, data, filename, version, txn):
         # First, move the blob into the blob directory
-        dir = self.fshelper.getPathForOID(oid)
-        if not os.path.exists(dir):
-            os.mkdir(dir)
+        self.fshelper.getPathForOID(oid, create=True)
         fd, target = self.fshelper.blob_mkstemp(oid, serial)
         os.close(fd)
 
@@ -935,14 +933,7 @@
             raise POSException.POSKeyError("No blob file", oid, serial)
 
         # First, we'll create the directory for this oid, if it doesn't exist. 
-        targetpath = self.fshelper.getPathForOID(oid)
-        if not os.path.exists(targetpath):
-            try:
-                os.makedirs(targetpath, 0700)
-            except OSError:
-                # We might have lost a race.  If so, the directory
-                # must exist now
-                assert os.path.exists(targetpath)
+        targetpath = self.fshelper.getPathForOID(oid, create=True)
 
         # OK, it's not here and we (or someone) needs to get it.  We
         # want to avoid getting it multiple times.  We want to avoid
@@ -1004,7 +995,7 @@
                 pass
 
     def temporaryDirectory(self):
-        return self.blob_dir
+        return self.fshelper.temp_dir
 
     def tpc_vote(self, txn):
         """Storage API: vote on a transaction."""
@@ -1128,19 +1119,15 @@
                     assert s == tid, (s, tid)
                     self._cache.store(oid, version, s, None, data)
 
-        
         if self.fshelper is not None:
             blobs = self._tbuf.blobs
             while blobs:
                 oid, blobfilename = blobs.pop()
-                targetpath = self.fshelper.getPathForOID(oid)
-                if not os.path.exists(targetpath):
-                    os.makedirs(targetpath, 0700)
+                targetpath = self.fshelper.getPathForOID(oid, create=True)
                 rename_or_copy_blob(blobfilename,
                           self.fshelper.getBlobFilename(oid, tid),
                           )
 
-                    
         self._tbuf.clear()
 
     def undo(self, trans_id, txn):
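
The recurring change in this file (and in blob.py below) is that callers no
longer create per-OID directories by hand; the filesystem helper does it when
asked. A sketch of the new idiom, with a hypothetical blob directory and
hypothetical ids:

    from ZODB.blob import FilesystemHelper
    from ZODB.utils import p64

    fsh = FilesystemHelper('/var/zodb/blobs')   # hypothetical path; layout defaults to 'automatic'
    fsh.create()                                # makes base and tmp dirs, writes the .layout marker
    oid, tid = p64(42), p64(0x1234)             # hypothetical object and transaction ids
    path = fsh.getPathForOID(oid, create=True)  # directory is created on demand, races tolerated
    filename = fsh.getBlobFilename(oid, tid)    # nested path ending in '0x1234.blob'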

Modified: ZODB/branches/3.8/src/ZEO/tests/testZEO.py
===================================================================
--- ZODB/branches/3.8/src/ZEO/tests/testZEO.py	2008-08-27 09:59:44 UTC (rev 90418)
+++ ZODB/branches/3.8/src/ZEO/tests/testZEO.py	2008-08-27 10:05:58 UTC (rev 90419)
@@ -494,8 +494,7 @@
             self._storage.tpc_abort(t)
             raise
         self.assert_(not os.path.exists(tfname))
-        filename = os.path.join(self.blobdir, oid_repr(oid),
-                                tid_repr(revid) + BLOB_SUFFIX)
+        filename = self._storage.fshelper.getBlobFilename(oid, revid)
         self.assert_(os.path.exists(filename))
         self.assertEqual(somedata, open(filename).read())
 
@@ -543,7 +542,7 @@
         self.assert_((os.stat(filename).st_mode & stat.S_IREAD))
 
     def checkTemporaryDirectory(self):
-        self.assertEquals(self.blob_cache_dir,
+        self.assertEquals(os.path.join(self.blob_cache_dir, 'tmp'),
                           self._storage.temporaryDirectory())
 
     def checkTransactionBufferCleanup(self):
@@ -610,18 +609,16 @@
                 d1 = f.read(8096)
                 d2 = somedata.read(8096)
                 self.assertEqual(d1, d2)
-                
-        
-        # The file should have been copied to the server:
-        filename = os.path.join(self.blobdir, oid_repr(oid),
-                                tid_repr(revid) + BLOB_SUFFIX)
-        check_data(filename)
 
-        # It should also be in the cache:
-        filename = os.path.join(self.blob_cache_dir, oid_repr(oid),
-                                tid_repr(revid) + BLOB_SUFFIX)
+        # The file should be in the cache ...
+        filename = self._storage.fshelper.getBlobFilename(oid, revid)
         check_data(filename)
 
+        # ... and on the server
+        server_filename = filename.replace(self.blob_cache_dir, self.blobdir)
+        self.assert_(server_filename.startswith(self.blobdir))
+        check_data(server_filename)
+
         # If we remove it from the cache and call loadBlob, it should
         # come back. We can do this in many threads.  We'll instrument
         # the method that is used to request data from the server to

Modified: ZODB/branches/3.8/src/ZODB/Connection.py
===================================================================
--- ZODB/branches/3.8/src/ZODB/Connection.py	2008-08-27 09:59:44 UTC (rev 90418)
+++ ZODB/branches/3.8/src/ZODB/Connection.py	2008-08-27 10:05:58 UTC (rev 90419)
@@ -1251,7 +1251,7 @@
                   transaction):
         serial = self.store(oid, serial, data, version, transaction)
 
-        targetpath = self._getBlobPath(oid)
+        targetpath = self._getBlobPath()
         if not os.path.exists(targetpath):
             os.makedirs(targetpath, 0700)
 
@@ -1270,14 +1270,12 @@
             return self._storage.loadBlob(oid, serial)
         return filename
 
-    def _getBlobPath(self, oid):
-        return os.path.join(self.temporaryDirectory(),
-                            utils.oid_repr(oid)
-                            )
+    def _getBlobPath(self):
+        return os.path.join(self.temporaryDirectory(), 'savepoints')
 
     def _getCleanFilename(self, oid, tid):
-        return os.path.join(self._getBlobPath(oid),
-                            "%s%s" % (utils.tid_repr(tid), SAVEPOINT_SUFFIX,)
+        return os.path.join(self._getBlobPath(),
+                            "%s-%s%s" % (utils.oid_repr(oid), utils.tid_repr(tid), SAVEPOINT_SUFFIX,)
                             )
 
     def temporaryDirectory(self):
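
With this change a savepoint no longer gets a per-OID directory under the
connection's temporary directory; each savepoint file is a single
`<oid>-<tid>.spb` entry in a `savepoints` subdirectory. A small sketch of the
resulting name, with hypothetical ids and temporary directory:

    import os
    from ZODB import utils
    from ZODB.blob import SAVEPOINT_SUFFIX

    tmp = '/var/zodb/blobs/tmp'                 # hypothetical temporaryDirectory()
    oid, tid = utils.p64(3), utils.p64(0x1234)  # hypothetical ids
    filename = os.path.join(tmp, 'savepoints',
                            "%s-%s%s" % (utils.oid_repr(oid),
                                         utils.tid_repr(tid),
                                         SAVEPOINT_SUFFIX))
    # -> /var/zodb/blobs/tmp/savepoints/0x03-0x1234.spb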

Modified: ZODB/branches/3.8/src/ZODB/blob.py
===================================================================
--- ZODB/branches/3.8/src/ZODB/blob.py	2008-08-27 09:59:44 UTC (rev 90418)
+++ ZODB/branches/3.8/src/ZODB/blob.py	2008-08-27 10:05:58 UTC (rev 90419)
@@ -15,8 +15,10 @@
 """
 
 import base64
+import binascii
 import logging
 import os
+import re
 import shutil
 import stat
 import sys
@@ -43,6 +45,9 @@
 BLOB_SUFFIX = ".blob"
 SAVEPOINT_SUFFIX = ".spb"
 
+LAYOUT_MARKER = '.layout'
+LAYOUTS = {}
+
 valid_modes = 'r', 'w', 'r+', 'a'
 
 # Threading issues:
@@ -292,22 +297,43 @@
     # with blobs and storages needn't indirect through this if they
     # want to perform blob storage differently.
 
-    def __init__(self, base_dir):
-        self.base_dir = base_dir
+    def __init__(self, base_dir, layout_name='automatic'):
+        self.base_dir = os.path.normpath(base_dir) + '/'
         self.temp_dir = os.path.join(base_dir, 'tmp')
 
+        if layout_name == 'automatic':
+            layout_name = auto_layout_select(base_dir)
+        if layout_name == 'lawn':
+            log('The `lawn` blob directory layout is deprecated due to '
+                'scalability issues on some file systems, please consider '
+                'migrating to the `bushy` layout.', level=logging.WARN)
+        self.layout_name = layout_name 
+        self.layout = LAYOUTS[layout_name]
+
     def create(self):
         if not os.path.exists(self.base_dir):
             os.makedirs(self.base_dir, 0700)
-            log("Blob cache directory '%s' does not exist. "
-                "Created new directory." % self.base_dir,
-                level=logging.INFO)
+            log("Blob directory '%s' does not exist. "
+                "Created new directory." % self.base_dir)
         if not os.path.exists(self.temp_dir):
             os.makedirs(self.temp_dir, 0700)
             log("Blob temporary directory '%s' does not exist. "
-                "Created new directory." % self.temp_dir,
-                level=logging.INFO)
+                "Created new directory." % self.temp_dir)
 
+        if not os.path.exists(os.path.join(self.base_dir, LAYOUT_MARKER)):
+            layout_marker = open(
+                os.path.join(self.base_dir, LAYOUT_MARKER), 'wb')
+            layout_marker.write(self.layout_name)
+        else:
+            layout_marker = open(
+                os.path.join(self.base_dir, LAYOUT_MARKER), 'rb')
+            layout = layout_marker.read().strip()
+            if layout != self.layout_name:
+                raise ValueError(
+                    "Directory layout `%s` selected for blob directory %s, but "
+                    "marker found for layout `%s`" %
+                    (self.layout_name, self.base_dir, layout))
+
     def isSecure(self, path):
         """Ensure that (POSIX) path mode bits are 0700."""
         return (os.stat(path).st_mode & 077) == 0
@@ -317,19 +343,51 @@
             log('Blob dir %s has insecure mode setting' % self.base_dir,
                 level=logging.WARNING)
 
-    def getPathForOID(self, oid):
+    def getPathForOID(self, oid, create=False):
         """Given an OID, return the path on the filesystem where
         the blob data relating to that OID is stored.
 
+        If the create flag is given, the path is also created if it didn't
+        exist already.
+
         """
-        return os.path.join(self.base_dir, utils.oid_repr(oid))
+        # OIDs are numbers and sometimes passed around as integers. For our
+        # computations we rely on the 64-bit packed string representation.
+        if isinstance(oid, int):
+            oid = utils.p64(oid)
 
+        path = self.layout.oid_to_path(oid)
+        path = os.path.join(self.base_dir, path)
+
+        if create and not os.path.exists(path):
+            try:
+                os.makedirs(path, 0700)
+            except OSError:
+                # We might have lost a race.  If so, the directory
+                # must exist now
+                assert os.path.exists(path)
+        return path
+
+    def getOIDForPath(self, path):
+        """Given a path, return an OID, if the path is a valid path for an
+        OID. The inverse function to `getPathForOID`.
+
+        Raises ValueError if the path is not valid for an OID.
+
+        """
+        path = path[len(self.base_dir):]
+        return self.layout.path_to_oid(path)
+
     def getBlobFilename(self, oid, tid):
         """Given an oid and a tid, return the full filename of the
         'committed' blob file related to that oid and tid.
 
         """
         oid_path = self.getPathForOID(oid)
+        # TIDs are numbers and sometimes passed around as integers. For our
+        # computations we rely on the 64-bit packed string representation
+        if isinstance(tid, int):
+            tid = utils.p64(tid)
         filename = "%s%s" % (utils.tid_repr(tid), BLOB_SUFFIX)
         return os.path.join(oid_path, filename)
 
@@ -359,10 +417,9 @@
         if not filename.endswith(BLOB_SUFFIX):
             return None, None
         path, filename = os.path.split(filename)
-        oid = os.path.split(path)[1]
+        oid = self.getOIDForPath(path)
 
         serial = filename[:-len(BLOB_SUFFIX)]
-        oid = utils.repr_to_oid(oid)
         serial = utils.repr_to_oid(serial)
         return oid, serial 
 
@@ -372,30 +429,110 @@
 
         """
         oids = []
-        base_dir = self.base_dir
-        for oidpath in os.listdir(base_dir):
-            for filename in os.listdir(os.path.join(base_dir, oidpath)):
-                blob_path = os.path.join(base_dir, oidpath, filename)
+        for oid, oidpath in self.listOIDs():
+            for filename in os.listdir(oidpath):
+                blob_path = os.path.join(oidpath, filename)
                 oid, serial = self.splitBlobFilename(blob_path)
                 if search_serial == serial:
                     oids.append(oid)
         return oids
 
     def listOIDs(self):
-        """Lists all OIDs and their paths.
-
+        """Iterates over all paths under the base directory that contain blob
+        files.
         """
-        for candidate in os.listdir(self.base_dir):
-            if candidate == 'tmp':
+        for path, dirs, files in os.walk(self.base_dir):
+            # Make sure we traverse in a stable order. This is mainly to make
+            # testing predictable.
+            dirs.sort()
+            files.sort()
+            try:
+                oid = self.getOIDForPath(path)
+            except ValueError:
                 continue
-            oid = utils.repr_to_oid(candidate)
-            yield oid, self.getPathForOID(oid)
+            yield oid, path
 
 
 class BlobStorageError(Exception):
     """The blob storage encountered an invalid state."""
 
+def auto_layout_select(path):
+    # A heuristic to look at a path and determine which directory layout to
+    # use.
+    layout_marker = os.path.join(path, LAYOUT_MARKER)
+    if not os.path.exists(path):
+        log('Blob directory %s does not exist. '
+            'Selected `bushy` layout. ' % path)
+        layout = 'bushy'
+    elif len(os.listdir(path)) == 0:
+        log('Blob directory `%s` is unused and has no layout marker set. '
+            'Selected `bushy` layout. ' % path)
+        layout = 'bushy'
+    elif LAYOUT_MARKER not in os.listdir(path):
+        log('Blob directory `%s` is used but has no layout marker set. '
+            'Selected `lawn` layout. ' % path)
+        layout = 'lawn'
+    else:
+        layout = open(layout_marker, 'rb').read()
+        layout = layout.strip()
+        log('Blob directory `%s` has layout marker set. '
+            'Selected `%s` layout. ' % (path, layout))
+    return layout
 
+
+class BushyLayout(object):
+    """A bushy directory layout for blob directories.
+
+    Creates an 8-level directory structure (one level per byte) in
+    big-endian order from the OID of an object.
+
+    """
+
+    blob_path_pattern = r'^' + (r'0x[0-9a-f]{1,2}/*'*8) + r'$'
+    blob_path_pattern = re.compile(blob_path_pattern)
+
+    def oid_to_path(self, oid):
+        directories = []
+        # Create the bushy directory structure, one level per byte of the
+        # OID, most significant byte first (big-endian order)
+        for byte in str(oid):
+            directories.append('0x%s' % binascii.hexlify(byte))
+        return '/'.join(directories)
+
+    def path_to_oid(self, path):
+        if self.blob_path_pattern.match(path) is None:
+            raise ValueError("Not a valid OID path: `%s`" % path)
+        path = path.split('/')
+        # Each path segment stores a byte in hex representation. Turn it into
+        # an int and then get the character for our byte string.
+        oid = ''.join(binascii.unhexlify(byte[2:]) for byte in path)
+        return oid
+
+LAYOUTS['bushy'] = BushyLayout()
+
+
+class LawnLayout(object):
+    """A shallow directory layout for blob directories.
+
+    Creates a single level of directories (one for each oid).
+
+    """
+
+    def oid_to_path(self, oid):
+        return utils.oid_repr(oid)
+
+    def path_to_oid(self, path):
+        try:
+            if path == '':
+                # This is a special case where repr_to_oid converts '' to the
+                # OID z64.
+                raise TypeError()
+            return utils.repr_to_oid(path)
+        except TypeError:
+            raise ValueError('Not a valid OID path: `%s`' % path)
+
+LAYOUTS['lawn'] = LawnLayout()
+
 class BlobStorage(SpecificationDecoratorBase):
     """A storage to support blobs."""
 
@@ -406,13 +543,13 @@
     __slots__ = ('fshelper', 'dirty_oids', '_BlobStorage__supportsUndo',
                  '_blobs_pack_is_in_progress', )
 
-    def __new__(self, base_directory, storage):
+    def __new__(self, base_directory, storage, layout='automatic'):
         return SpecificationDecoratorBase.__new__(self, storage)
 
-    def __init__(self, base_directory, storage):
+    def __init__(self, base_directory, storage, layout='automatic'):
         # XXX Log warning if storage is ClientStorage
         SpecificationDecoratorBase.__init__(self, storage)
-        self.fshelper = FilesystemHelper(base_directory)
+        self.fshelper = FilesystemHelper(base_directory, layout)
         self.fshelper.create()
         self.fshelper.checkSecure()
         self.dirty_oids = []
@@ -444,10 +581,7 @@
 
         self._lock_acquire()
         try:
-            targetpath = self.fshelper.getPathForOID(oid)
-            if not os.path.exists(targetpath):
-                os.makedirs(targetpath, 0700)
-
+            self.fshelper.getPathForOID(oid, create=True)
             targetname = self.fshelper.getBlobFilename(oid, serial)
             rename_or_copy_blob(blobfilename, targetname)
 
@@ -493,7 +627,6 @@
         # if they are still needed by attempting to load the revision
         # of that object from the database.  This is maybe the slowest
         # possible way to do this, but it's safe.
-        base_dir = self.fshelper.base_dir
         for oid, oid_path in self.fshelper.listOIDs():
             files = os.listdir(oid_path)
 
@@ -501,7 +634,6 @@
                 filepath = os.path.join(oid_path, filename)
                 whatever, serial = self.fshelper.splitBlobFilename(filepath)
                 try:
-                    fn = self.fshelper.getBlobFilename(oid, serial)
                     self.loadSerial(oid, serial)
                 except POSKeyError:
                     remove_committed(filepath)
@@ -511,7 +643,6 @@
 
     @non_overridable
     def _packNonUndoing(self, packtime, referencesf):
-        base_dir = self.fshelper.base_dir
         for oid, oid_path in self.fshelper.listOIDs():
             exists = True
             try:
@@ -565,21 +696,10 @@
     @non_overridable
     def getSize(self):
         """Return the size of the database in bytes."""
-        orig_size = getProxiedObject(self).getSize()
-        blob_size = 0
-        base_dir = self.fshelper.base_dir
-        for oid in os.listdir(base_dir):
-            sub_dir = os.path.join(base_dir, oid)
-            if not os.path.isdir(sub_dir):
-                continue
-            for serial in os.listdir(sub_dir):
-                if not serial.endswith(BLOB_SUFFIX):
-                    continue
-                file_path = os.path.join(base_dir, oid, serial)
-                blob_size += os.stat(file_path).st_size
+        # XXX The old way of computing the size is way too resource hungry.
+        # We need to do some kind of estimation instead.
+        return getProxiedObject(self).getSize()
 
-        return orig_size + blob_size
-
     @non_overridable
     def undo(self, serial_id, transaction):
         undo_serial, keys = getProxiedObject(self).undo(serial_id, transaction)
@@ -598,7 +718,6 @@
             # we get all the blob oids on the filesystem related to the
             # transaction we want to undo.
             for oid in self.fshelper.getOIDsForSerial(serial_id):
-
                 # we want to find the serial id of the previous revision
                 # of this blob object.
                 load_result = self.loadBefore(oid, serial_id)

Copied: ZODB/branches/3.8/src/ZODB/scripts/migrateblobs.py (from rev 90416, ZODB/branches/ctheune-bushy-directory-3.8/src/ZODB/scripts/migrateblobs.py)
===================================================================
--- ZODB/branches/3.8/src/ZODB/scripts/migrateblobs.py	                        (rev 0)
+++ ZODB/branches/3.8/src/ZODB/scripts/migrateblobs.py	2008-08-27 10:05:58 UTC (rev 90419)
@@ -0,0 +1,75 @@
+##############################################################################
+#
+# Copyright (c) 2008 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE
+#
+##############################################################################
+"""A script to migrate a blob directory into a different layout.
+"""
+
+import logging
+import optparse
+import os
+import shutil
+
+from ZODB.blob import FilesystemHelper, rename_or_copy_blob
+from ZODB.utils import cp, oid_repr
+
+
+def link_or_copy(f1, f2):
+    try:
+        os.link(f1, f2)
+    except OSError:
+        shutil.copy(f1, f2)
+
+
+def migrate(source, dest, layout):
+    source_fsh = FilesystemHelper(source)
+    source_fsh.create()
+    dest_fsh = FilesystemHelper(dest, layout)
+    dest_fsh.create()
+    print "Migrating blob data from `%s` (%s) to `%s` (%s)" % (
+        source, source_fsh.layout_name, dest, dest_fsh.layout_name)
+    for oid, path in source_fsh.listOIDs():
+        dest_path = dest_fsh.getPathForOID(oid, create=True)
+        files = os.listdir(path)
+        for file in files:
+            source_file = os.path.join(path, file)
+            dest_file = os.path.join(dest_path, file)
+            link_or_copy(source_file, dest_file)
+        print "\tOID: %s - %s files " % (oid_repr(oid), len(files))
+
+
+def main(source=None, dest=None, layout="bushy"):
+    usage = "usage: %prog [options] <source> <dest> <layout>"
+    description = ("Create the new directory <dest> and migrate all blob "
+                   "data from <source> to <dest> while using the new <layout> for "
+                   "<dest>")
+
+    parser = optparse.OptionParser(usage=usage, description=description)
+    parser.add_option("-l", "--layout",
+                      default=layout, type='choice',
+                      choices=['bushy', 'lawn'],
+                      help="Define the layout to use for the new directory "
+                      "(bushy or lawn). Default: %default")
+    options, args = parser.parse_args()
+
+    if not len(args) == 2:
+        parser.error("source and destination must be given")
+
+    logging.getLogger().addHandler(logging.StreamHandler())
+    logging.getLogger().setLevel(0)
+
+    source, dest = args
+    migrate(source, dest, options.layout)
+
+
+if __name__ == '__main__':
+    main()
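
The script can be run directly (`migrateblobs.py [-l bushy|lawn] <source>
<dest>`) or, as the blob_layout.txt doctest below does, driven from Python; a
minimal sketch with hypothetical paths:

    from ZODB.scripts.migrateblobs import migrate

    # Links (or copies) every blob file from a lawn-layout source into a freshly
    # created bushy-layout destination; the source directory is left untouched.
    migrate('/var/zodb/blobs-lawn', '/var/zodb/blobs-bushy', 'bushy')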

Copied: ZODB/branches/3.8/src/ZODB/tests/blob_layout.txt (from rev 90416, ZODB/branches/ctheune-bushy-directory-3.8/src/ZODB/tests/blob_layout.txt)
===================================================================
--- ZODB/branches/3.8/src/ZODB/tests/blob_layout.txt	                        (rev 0)
+++ ZODB/branches/3.8/src/ZODB/tests/blob_layout.txt	2008-08-27 10:05:58 UTC (rev 90419)
@@ -0,0 +1,281 @@
+======================
+Blob directory layouts
+======================
+
+The internal structure of the blob directories is governed by so-called
+`layouts`. The current default layout is called `bushy`.
+
+The original blob implementation used a layout that we now call `lawn` and
+which is still available for backwards compatibility.
+
+Layouts implement two methods: one for computing a relative path for an
+OID and one for turning a relative path back into an OID.
+
+Our terminology is roughly the same as used in `DirectoryStorage`.
+
+The `bushy` layout
+==================
+
+The bushy layout splits the OID into its 8 bytes and creates one directory
+level for each byte, in big-endian order, named by the hexlified
+representation of the byte value. This results in 8 levels of directories, the
+leaf directories being used for the revisions of the blobs and at most 256
+entries per directory level:
+
+>>> from ZODB.blob import BushyLayout
+>>> bushy = BushyLayout()
+>>> bushy.oid_to_path('\x00\x00\x00\x00\x00\x00\x00\x00')
+'0x00/0x00/0x00/0x00/0x00/0x00/0x00/0x00'
+>>> bushy.oid_to_path('\x00\x00\x00\x00\x00\x00\x00\x01')
+'0x00/0x00/0x00/0x00/0x00/0x00/0x00/0x01'
+
+>>> bushy.path_to_oid('0x01/0x00/0x00/0x00/0x00/0x00/0x00/0x00')
+'\x01\x00\x00\x00\x00\x00\x00\x00'
+>>> bushy.path_to_oid('0xff/0x00/0x00/0x00/0x00/0x00/0x00/0x00')
+'\xff\x00\x00\x00\x00\x00\x00\x00'
+
+Paths that do not represent an OID will cause a ValueError:
+
+>>> bushy.path_to_oid('tmp')
+Traceback (most recent call last):
+ValueError: Not a valid OID path: `tmp`
+
+
+The `lawn` layout
+=================
+
+The lawn layout creates one directory for each blob, named by the hex
+representation of the blob's OID. This has some limitations on various file systems
+like performance penalties or the inability to store more than a given number
+of blobs at the same time (e.g. 32k on ext3).
+
+>>> from ZODB.blob import LawnLayout
+>>> lawn = LawnLayout()
+>>> lawn.oid_to_path('\x00\x00\x00\x00\x00\x00\x00\x00')
+'0x00'
+>>> lawn.oid_to_path('\x00\x00\x00\x00\x00\x00\x00\x01')
+'0x01'
+
+>>> lawn.path_to_oid('0x01')
+'\x00\x00\x00\x00\x00\x00\x00\x01'
+
+Paths that do not represent an OID will cause a ValueError:
+
+>>> lawn.path_to_oid('tmp')
+Traceback (most recent call last):
+ValueError: Not a valid OID path: `tmp`
+>>> lawn.path_to_oid('')
+Traceback (most recent call last):
+ValueError: Not a valid OID path: ``
+
+
+Auto-detecting the layout of a directory
+========================================
+
+To allow easier migration, we provide an auto-detection feature that analyses a
+blob directory and decides on a strategy to use. In general it prefers to
+choose the `bushy` layout, unless it determines that the directory has
+already been used to create a lawn structure.
+
+>>> from ZODB.blob import auto_layout_select
+
+1. Non-existing directories will trigger a bushy layout:
+
+>>> import tempfile
+>>> import shutil
+>>> d = tempfile.mkdtemp()
+>>> shutil.rmtree(d)
+>>> auto_layout_select(d)
+'bushy'
+
+2. Empty directories will trigger a bushy layout too:
+
+>>> d = tempfile.mkdtemp()
+>>> auto_layout_select(d)
+'bushy'
+
+3. If the directory contains a marker for the strategy it will be used:
+
+>>> from ZODB.blob import LAYOUT_MARKER
+>>> import os.path
+>>> open(os.path.join(d, LAYOUT_MARKER), 'wb').write('bushy')
+>>> auto_layout_select(d)
+'bushy'
+>>> open(os.path.join(d, LAYOUT_MARKER), 'wb').write('lawn')
+>>> auto_layout_select(d)
+'lawn'
+>>> shutil.rmtree(d)
+
+4. If the directory does not contain a marker but other files, we assume that
+it was created with an earlier version of the blob implementation and uses our
+`lawn` layout:
+
+>>> d = tempfile.mkdtemp()
+>>> open(os.path.join(d, '0x0101'), 'wb').write('foo')
+>>> auto_layout_select(d)
+'lawn'
+>>> shutil.rmtree(d)
+
+
+Directory layout markers
+========================
+
+When the file system helper (FSH) is asked to create the directory structure,
+it will leave a marker with the chosen layout if no marker exists yet:
+
+>>> from ZODB.blob import FilesystemHelper
+>>> d = tempfile.mkdtemp()
+>>> blobs = os.path.join(d, 'blobs')
+>>> fsh = FilesystemHelper(blobs)
+>>> fsh.layout_name
+'bushy'
+>>> fsh.create()
+>>> open(os.path.join(blobs, LAYOUT_MARKER), 'rb').read()
+'bushy'
+
+If the FSH finds a marker, then it verifies whether its content matches the
+strategy that was chosen. It will raise an exception if we try to work with a
+directory that has a different marker than the chosen strategy:
+
+>>> fsh = FilesystemHelper(blobs, 'lawn')
+>>> fsh.layout_name
+'lawn'
+>>> fsh.create() # doctest: +ELLIPSIS
+Traceback (most recent call last):
+ValueError: Directory layout `lawn` selected for blob directory /.../blobs/, but marker found for layout `bushy`
+>>> shutil.rmtree(blobs)
+
+This function interacts with the automatic detection in such a way that an
+unmarked directory will be marked the first time its layout is auto-detected,
+and that marker will be used from then on:
+
+>>> import ZODB.FileStorage
+>>> from ZODB.blob import BlobStorage
+>>> datafs = os.path.join(d, 'data.fs')
+>>> base_storage = ZODB.FileStorage.FileStorage(datafs)
+
+>>> os.mkdir(blobs)
+>>> open(os.path.join(blobs, 'foo'), 'wb').write('foo')
+>>> blob_storage = BlobStorage(blobs, base_storage)
+>>> blob_storage.fshelper.layout_name
+'lawn'
+>>> open(os.path.join(blobs, LAYOUT_MARKER), 'rb').read()
+'lawn'
+>>> blob_storage = BlobStorage(blobs, base_storage, layout='bushy') # doctest: +ELLIPSIS
+Traceback (most recent call last):
+ValueError: Directory layout `bushy` selected for blob directory /.../blobs/, but marker found for layout `lawn`
+
+
+>>> shutil.rmtree(d)
+
+
+Migrating between directory layouts
+===================================
+
+A script called `migrateblobs.py` is distributed with the ZODB for offline
+migration capabilities between different directory layouts. It can migrate any
+blob directory layout to any other layout. It leaves the original blob
+directory untouched (apart from possibly creating a temporary directory and
+the storage layout marker).
+
+The migration is accessible as a library function:
+
+>>> from ZODB.scripts.migrateblobs import migrate
+
+Create a `lawn` directory structure and migrate it to the new `bushy` one:
+
+>>> from ZODB.blob import FilesystemHelper
+>>> d = tempfile.mkdtemp()
+>>> old = os.path.join(d, 'old')
+>>> old_fsh = FilesystemHelper(old, 'lawn')
+>>> old_fsh.create()
+>>> blob1 = old_fsh.getPathForOID(7039, create=True)
+>>> blob2 = old_fsh.getPathForOID(10, create=True)
+>>> blob3 = old_fsh.getPathForOID(7034, create=True)
+>>> open(os.path.join(blob1, 'foo'), 'wb').write('foo')
+>>> open(os.path.join(blob1, 'foo2'), 'wb').write('bar')
+>>> open(os.path.join(blob2, 'foo3'), 'wb').write('baz')
+>>> open(os.path.join(blob2, 'foo4'), 'wb').write('qux')
+>>> open(os.path.join(blob3, 'foo5'), 'wb').write('quux')
+>>> open(os.path.join(blob3, 'foo6'), 'wb').write('corge')
+
+Committed blobs have their permissions set to 000
+
+The migration function is called with the old and the new path and the layout
+that shall be used for the new directory:
+
+>>> bushy = os.path.join(d, 'bushy')
+>>> migrate(old, bushy, 'bushy')  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+Migrating blob data from `/.../old` (lawn) to `/.../bushy` (bushy)
+    OID: 0x0a - 2 files
+    OID: 0x1b7a - 2 files
+    OID: 0x1b7f - 2 files
+
+The new directory now contains the same files in different directories, but
+with the same sizes and permissions:
+
+>>> lawn_files = {}
+>>> for base, dirs, files in os.walk(d):
+...     for file_name in files:
+...         lawn_files[file_name] = os.path.join(base, file_name)
+
+>>> bushy_files = {}
+>>> for base, dirs, files in os.walk(bushy):
+...     for file_name in files:
+...         bushy_files[file_name] = os.path.join(base, file_name)
+
+>>> len(lawn_files) == len(bushy_files)
+True
+
+>>> for file_name, lawn_path in sorted(lawn_files.items()):
+...     if file_name == '.layout':
+...         continue
+...     lawn_stat = os.stat(lawn_path)
+...     bushy_path = bushy_files[file_name]
+...     bushy_stat = os.stat(bushy_path)
+...     print lawn_path, '-->', bushy_path
+...     if ((lawn_stat.st_mode, lawn_stat.st_size) !=
+...         (bushy_stat.st_mode, bushy_stat.st_size)):
+...         print 'oops'
+old/0x1b7f/foo --> bushy/0x00/0x00/0x00/0x00/0x00/0x00/0x1b/0x7f/foo
+old/0x1b7f/foo2 --> bushy/0x00/0x00/0x00/0x00/0x00/0x00/0x1b/0x7f/foo2
+old/0x0a/foo3 --> bushy/0x00/0x00/0x00/0x00/0x00/0x00/0x00/0x0a/foo3
+old/0x0a/foo4 --> bushy/0x00/0x00/0x00/0x00/0x00/0x00/0x00/0x0a/foo4
+old/0x1b7a/foo5 --> bushy/0x00/0x00/0x00/0x00/0x00/0x00/0x1b/0x7a/foo5
+old/0x1b7a/foo6 --> bushy/0x00/0x00/0x00/0x00/0x00/0x00/0x1b/0x7a/foo6
+
+We can also migrate the bushy layout back to the lawn layout:
+
+>>> lawn = os.path.join(d, 'lawn')
+>>> migrate(bushy, lawn, 'lawn')
+Migrating blob data from `/.../bushy` (bushy) to `/.../lawn` (lawn)
+   OID: 0x0a - 2 files
+   OID: 0x1b7a - 2 files
+   OID: 0x1b7f - 2 files
+
+>>> lawn_files = {}
+>>> for base, dirs, files in os.walk(lawn):
+...     for file_name in files:
+...         lawn_files[file_name] = os.path.join(base, file_name)
+
+>>> len(lawn_files) == len(bushy_files)
+True
+
+>>> for file_name, lawn_path in sorted(lawn_files.items()):
+...     if file_name == '.layout':
+...         continue
+...     lawn_stat = os.stat(lawn_path)
+...     bushy_path = bushy_files[file_name]
+...     bushy_stat = os.stat(bushy_path)
+...     print bushy_path, '-->', lawn_path
+...     if ((lawn_stat.st_mode, lawn_stat.st_size) !=
+...         (bushy_stat.st_mode, bushy_stat.st_size)):
+...         print 'oops'
+bushy/0x00/0x00/0x00/0x00/0x00/0x00/0x1b/0x7f/foo --> lawn/0x1b7f/foo
+bushy/0x00/0x00/0x00/0x00/0x00/0x00/0x1b/0x7f/foo2 --> lawn/0x1b7f/foo2
+bushy/0x00/0x00/0x00/0x00/0x00/0x00/0x00/0x0a/foo3 --> lawn/0x0a/foo3
+bushy/0x00/0x00/0x00/0x00/0x00/0x00/0x00/0x0a/foo4 --> lawn/0x0a/foo4
+bushy/0x00/0x00/0x00/0x00/0x00/0x00/0x1b/0x7a/foo5 --> lawn/0x1b7a/foo5
+bushy/0x00/0x00/0x00/0x00/0x00/0x00/0x1b/0x7a/foo6 --> lawn/0x1b7a/foo6
+
+>>> shutil.rmtree(d)

Modified: ZODB/branches/3.8/src/ZODB/tests/blob_tempdir.txt
===================================================================
--- ZODB/branches/3.8/src/ZODB/tests/blob_tempdir.txt	2008-08-27 09:59:44 UTC (rev 90418)
+++ ZODB/branches/3.8/src/ZODB/tests/blob_tempdir.txt	2008-08-27 10:05:58 UTC (rev 90419)
@@ -32,7 +32,7 @@
   >>> from ZODB.DB import DB
   >>> from tempfile import mkdtemp
   >>> import os.path
-  >>> base_storage = MappingStorage("test")
+  >>> base_storage = MappingStorage('test')
   >>> blob_dir = mkdtemp()
   >>> blob_storage = BlobStorage(blob_dir, base_storage)
   >>> database = DB(blob_storage)

Modified: ZODB/branches/3.8/src/ZODB/tests/blob_transaction.txt
===================================================================
--- ZODB/branches/3.8/src/ZODB/tests/blob_transaction.txt	2008-08-27 09:59:44 UTC (rev 90418)
+++ ZODB/branches/3.8/src/ZODB/tests/blob_transaction.txt	2008-08-27 10:05:58 UTC (rev 90419)
@@ -191,15 +191,15 @@
     >>> root4['blob1'].open('r').read()
     'this is blob 1woot!this is from connection 3'
 
-BlobStorages implementation of getSize() includes the blob data and adds it to
-the underlying storages result of getSize(). (We need to ensure the last
+BlobStorage's implementation of getSize() does not include the blob data and
+only returns what the underlying storage does.  (We need to ensure the last
 number to be an int, otherwise it will be a long on 32-bit platforms and an
 int on 64-bit)::
 
     >>> underlying_size = base_storage.getSize()
     >>> blob_size = blob_storage.getSize()
     >>> int(blob_size - underlying_size)
-    91
+    0
 
 You can't commit a transaction while blob files are open:
 
@@ -243,22 +243,53 @@
     >>> root5['blob'].open("r").read()
     "I'm a happy blob. And I'm singing."
     >>> savepoint = transaction.savepoint(optimistic=True)
+
     >>> root5['blob'].open("r").read()
     "I'm a happy blob. And I'm singing."
+
+Savepoints store the blobs in the `savepoints` directory in the temporary
+directory of the blob storage:
+
+    >>> os.listdir(os.path.join(blob_dir, 'tmp', 'savepoints'))
+    ['0x03-0x....spb']
     >>> transaction.commit()
 
+After committing the transaction, the temporary savepoint files are moved to
+their committed location, leaving the savepoints directory empty again:
+
+    >>> os.listdir(os.path.join(blob_dir, 'tmp', 'savepoints'))
+    []
+
 We support non-optimistic savepoints too:
 
     >>> root5['blob'].open("a").write(" And I'm dancing.")
     >>> root5['blob'].open("r").read()
     "I'm a happy blob. And I'm singing. And I'm dancing."
     >>> savepoint = transaction.savepoint()
+
+Again, the savepoint creates a new file for the blob state in the savepoints
+directory:
+
+    >>> os.listdir(os.path.join(blob_dir, 'tmp', 'savepoints'))
+    ['0x03-0x....spb']
+
     >>> root5['blob'].open("w").write(" And the weather is beautiful.")
     >>> savepoint.rollback()
+
+XXX Currently, savepoint state of blobs remains after a rollback:
+
+    >>> os.listdir(os.path.join(blob_dir, 'tmp', 'savepoints'))
+    ['0x03-0x....spb']
+
     >>> root5['blob'].open("r").read()
     "I'm a happy blob. And I'm singing. And I'm dancing."
     >>> transaction.abort()
 
+XXX Currently, savepoint state of blobs remains even after an abort:
+
+    >>> os.listdir(os.path.join(blob_dir, 'tmp', 'savepoints'))
+    ['0x03-0x....spb']
+
 Reading Blobs outside of a transaction
 --------------------------------------
 
@@ -322,9 +353,9 @@
     >>> base_storage = DummyBaseStorage()
     >>> blob_dir2 = mkdtemp()
     >>> blob_storage2 = BlobStorage(blob_dir2, base_storage)
-    >>> committed_blob_dir = os.path.join(blob_dir2, '0')
-    >>> committed_blob_file = os.path.join(committed_blob_dir, '0.blob')
-    >>> os.mkdir(committed_blob_dir)
+    >>> committed_blob_dir = blob_storage2.fshelper.getPathForOID(0)
+    >>> os.makedirs(committed_blob_dir)
+    >>> committed_blob_file = blob_storage2.fshelper.getBlobFilename(0, 0)
     >>> open(os.path.join(committed_blob_file), 'w').write('foo')
     >>> os.path.exists(committed_blob_file)
     True
@@ -341,21 +372,7 @@
 
 Note: This is a counter measure against regression of bug #126007.
 
-getSize with garbage in the directory structure
------------------------------------------------
 
-`getSize` iterates over the existing blob files in the blob directory and adds
-up their size. The blob directory sometimes contains temporary files that the
-getSize function needs to ignore:
-
-    >>> garbage_file = os.path.join(blob_dir, 'garbage')
-    >>> open(garbage_file, 'w').write('garbage')
-    >>> int(blob_storage.getSize())
-    881
-
-
-Note: This is a counter measer against regression of bug #12991.
-
 Teardown
 --------
 

Modified: ZODB/branches/3.8/src/ZODB/tests/testblob.py
===================================================================
--- ZODB/branches/3.8/src/ZODB/tests/testblob.py	2008-08-27 09:59:44 UTC (rev 90418)
+++ ZODB/branches/3.8/src/ZODB/tests/testblob.py	2008-08-27 10:05:58 UTC (rev 90419)
@@ -12,9 +12,9 @@
 #
 ##############################################################################
 
-import base64, os, shutil, tempfile, unittest
+import base64, os, re, shutil, tempfile, unittest
 import time
-from zope.testing import doctest
+from zope.testing import doctest, renormalizing
 import ZODB.tests.util
 
 from ZODB import utils
@@ -105,7 +105,6 @@
         self.here = os.getcwd()
         os.chdir(self.test_dir)
         self.storagefile = 'Data.fs'
-        os.mkdir('blobs')
         self.blob_dir = 'blobs'
 
     def tearDown(self):
@@ -483,7 +482,7 @@
     We can access the blob correctly:
 
     >>> tmpstore.loadBlob(blob_oid, tid) # doctest: +ELLIPSIS
-    '.../0x01/0x...blob'
+    '.../0x00/0x00/0x00/0x00/0x00/0x00/0x00/0x01/0x...blob'
 
     Clean up:
 
@@ -504,9 +503,20 @@
         "blob_basic.txt",  "blob_connection.txt", "blob_transaction.txt",
         "blob_packing.txt", "blob_importexport.txt", "blob_consume.txt",
         "blob_tempdir.txt",
+        optionflags=doctest.ELLIPSIS,
         setUp=ZODB.tests.util.setUp,
         tearDown=ZODB.tests.util.tearDown,
         ))
+    suite.addTest(doctest.DocFileSuite(
+        "blob_layout.txt",
+        optionflags=doctest.ELLIPSIS|doctest.NORMALIZE_WHITESPACE,
+        setUp=ZODB.tests.util.setUp,
+        tearDown=ZODB.tests.util.tearDown,
+        checker = renormalizing.RENormalizing([
+            (re.compile(r'[%(sep)s]' % dict(sep=os.path.sep)), '/'),
+            (re.compile(r'\S+/((old|bushy|lawn)/\S+/foo[23456]?)'), r'\1'),
+            ]),
+        ))
     suite.addTest(doctest.DocTestSuite(
         setUp=ZODB.tests.util.setUp,
         tearDown=ZODB.tests.util.tearDown,


