[Zodb-checkins] SVN: ZODB/branches/3.3/ Port from ZODB 3.2.

Tim Peters tim.one at comcast.net
Fri Feb 25 16:51:33 EST 2005


Log message for revision 29302:
  Port from ZODB 3.2.
  
  Stop believing the maximum oid cached in a FileStorage's .index file.
  
  This is a critical bugfix, although the problems it addresses are
  (a) rare and (b) not entirely fixed yet (more checkins to come).
  
  The true max oid is found efficiently now by exploiting the recently-added
  fsIndex.maxKey() method (which was, of course, added for this purpose).
  
  Also fixed a bug whereby the .index file could be updated on disk even
  when the FileStorage was opened in read-only mode.  The code was trying
  to prevent this, but missed the most obvious rewrite path.
  
  Incidentally improved many obsolete and/or incorrect comments.
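  
  For reference, the fix reduces to the following pattern (a minimal
  sketch, assuming `index` is an fsIndex instance; z64 is the 8-byte
  zero oid from ZODB.utils):
  
      from ZODB.utils import z64
  
      def true_max_oid(index):
          # fsIndex has no efficient __len__ or __nonzero__, so probe
          # with maxKey() and catch the empty-index case instead.
          try:
              return index.maxKey()  # largest oid actually in the index
          except ValueError:         # maxKey() raises this when empty
              return z64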
  

Changed:
  U   ZODB/branches/3.3/NEWS.txt
  U   ZODB/branches/3.3/src/ZODB/FileStorage/FileStorage.py
  U   ZODB/branches/3.3/src/ZODB/tests/testFileStorage.py

-=-
Modified: ZODB/branches/3.3/NEWS.txt
===================================================================
--- ZODB/branches/3.3/NEWS.txt	2005-02-25 20:46:19 UTC (rev 29301)
+++ ZODB/branches/3.3/NEWS.txt	2005-02-25 21:51:33 UTC (rev 29302)
@@ -28,6 +28,34 @@
 depending on platform, and should suffer far fewer (if any) intermittent
 "timed out waiting for storage to connect" failures.
 
+
+FileStorage
+-----------
+
+- A FileStorage's index file tried to maintain the index's largest oid as a
+  separate piece of data, incrementally updated over the storage's lifetime.
+  This scheme was more complicated than necessary, and so also more brittle
+  and slower than it needed to be.  It indirectly participated in a rare but
+  critical bug:  when a FileStorage was created via
+  ``copyTransactionsFrom()``, the "maximum oid" saved in the index file was
+  always 0.  Use that FileStorage, and it could then create "new" oids
+  starting over at 0 again, even though those oids were already in use by
+  old objects in the database.  Packing a FileStorage has no reason to
+  try to update the maximum oid in the index file either, so this kind of
+  damage could (and did) persist even across packing.
+
+  The index file's maximum-oid data is ignored now, but is still written
+  out so that ``.index`` files can be read by older versions of ZODB.
+  Finding the true maximum oid is done now by exploiting the fact that the
+  main index is really a kind of BTree (long ago, this wasn't true), and
+  finding the largest key in a BTree is inexpensive.
+
+- A FileStorage's index file could be updated on disk even if the storage
+  was opened in read-only mode.  That bug has been repaired.
+
+- An efficient ``maxKey()`` implementation was added to class ``fsIndex``.
+
+
 Pickle (in-memory Connection) Cache
 -----------------------------------
 

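As an aside, the stale-oid damage described in the NEWS entry can be
checked for by hand.  A hedged sketch (the 'Data.fs.index' path is
hypothetical, and it assumes the pickled index is already an fsIndex):

    import cPickle as pickle
    from ZODB.utils import z64, u64

    f = open('Data.fs.index', 'rb')      # substitute your own .index file
    data = pickle.Unpickler(f).load()
    f.close()

    cached = data['oid']                 # cached max oid, now ignored
    try:
        actual = data['index'].maxKey()  # true max oid in the index
    except ValueError:
        actual = z64                     # the index is empty
    if cached != actual:
        print 'stale cached oid: %d vs %d' % (u64(cached), u64(actual))
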
Modified: ZODB/branches/3.3/src/ZODB/FileStorage/FileStorage.py
===================================================================
--- ZODB/branches/3.3/src/ZODB/FileStorage/FileStorage.py	2005-02-25 20:46:19 UTC (rev 29301)
+++ ZODB/branches/3.3/src/ZODB/FileStorage/FileStorage.py	2005-02-25 21:51:33 UTC (rev 29302)
@@ -40,13 +40,8 @@
      import FileStorageFormatter, DataHeader, TxnHeader, DATA_HDR, \
      DATA_HDR_LEN, TRANS_HDR, TRANS_HDR_LEN, CorruptedDataError
 from ZODB.loglevels import BLATHER
+from ZODB.fsIndex import fsIndex
 
-try:
-    from ZODB.fsIndex import fsIndex
-except ImportError:
-    def fsIndex():
-        return {}
-
 t32 = 1L << 32
 
 packed_version = "FS21"
@@ -161,14 +156,13 @@
         r = self._restore_index()
         if r is not None:
             self._used_index = 1 # Marker for testing
-            index, vindex, start, maxoid, ltid = r
+            index, vindex, start, ltid = r
 
             self._initIndex(index, vindex, tindex, tvindex,
                             oid2tid, toid2tid, toid2tid_delete)
             self._pos, self._oid, tid = read_index(
                 self._file, file_name, index, vindex, tindex, stop,
-                ltid=ltid, start=start, maxoid=maxoid,
-                read_only=read_only,
+                ltid=ltid, start=start, read_only=read_only,
                 )
         else:
             self._used_index = 0 # Marker for testing
@@ -221,9 +215,8 @@
         # stores 4000 objects, and each random seek + read takes 7ms
         # (that was approximately true on Linux and Windows tests in
         # mid-2003), that's 28 seconds just to find the old tids.
-        # XXX Probably better to junk this and redefine _index as mapping
-        # XXX oid to (offset, tid) pair, via a new memory-efficient
-        # XXX BTree type.
+        # TODO:  Probably better to junk this and redefine _index as mapping
+        # oid to (offset, tid) pair, via a new memory-efficient BTree type.
         self._oid2tid = oid2tid
         # oid->tid map to transactionally add to _oid2tid.
         self._toid2tid = toid2tid
@@ -243,12 +236,18 @@
     def _save_index(self):
         """Write the database index to a file to support quick startup."""
 
+        if self._is_read_only:
+            return
+
         index_name = self.__name__ + '.index'
         tmp_name = index_name + '.index_tmp'
 
         f=open(tmp_name,'wb')
         p=Pickler(f,1)
 
+        # Note:  starting with ZODB 3.2.6, the 'oid' value stored is ignored
+        # by the code that reads the index.  We still write it, so that
+        # .index files can still be read by older ZODBs.
         info={'index': self._index, 'pos': self._pos,
               'oid': self._oid, 'vindex': self._vindex}
 
@@ -340,11 +339,22 @@
 
     def _restore_index(self):
         """Load database index to support quick startup."""
+        # Returns (index, vindex, pos, tid), or None in case of
+        # error.
+        # Starting with ZODB 3.2.6, the 'oid' value stored in the index
+        # is ignored.
+        # The index returned is always an instance of fsIndex.  If the
+        # index cached in the file is a Python dict, it's converted to
+        # fsIndex here, and, if we're not in read-only mode, the .index
+        # file is rewritten with the converted fsIndex so we don't need to
+        # convert it again the next time.
         file_name=self.__name__
         index_name=file_name+'.index'
 
-        try: f=open(index_name,'rb')
-        except: return None
+        try:
+            f = open(index_name, 'rb')
+        except:
+            return None
 
         p=Unpickler(f)
 
@@ -356,34 +366,31 @@
             return None
         index = info.get('index')
         pos = info.get('pos')
-        oid = info.get('oid')
         vindex = info.get('vindex')
-        if index is None or pos is None or oid is None or vindex is None:
+        if index is None or pos is None or vindex is None:
             return None
         pos = long(pos)
 
-        if isinstance(index, DictType) and not self._is_read_only:
-            # Convert to fsIndex
+        if isinstance(index, DictType):
+            # Convert to fsIndex.
             newindex = fsIndex()
-            if type(newindex) is not type(index):
-                # And we have fsIndex
-                newindex.update(index)
-
-                # Now save the index
+            newindex.update(index)
+            index = newindex
+            if not self._is_read_only:
+                # Save the converted index.
                 f = open(index_name, 'wb')
                 p = Pickler(f, 1)
-                info['index'] = newindex
+                info['index'] = index
                 p.dump(info)
                 f.close()
-
-                # Now call this method again to get the new data
+                # Now call this method again to get the new data.
                 return self._restore_index()
 
         tid = self._sane(index, pos)
         if not tid:
             return None
 
-        return index, vindex, pos, oid, tid
+        return index, vindex, pos, tid
 
     def close(self):
         self._file.close()
@@ -1548,7 +1555,7 @@
 
 def read_index(file, name, index, vindex, tindex, stop='\377'*8,
                ltid=z64, start=4L, maxoid=z64, recover=0, read_only=0):
-    """Scan the entire file storage and recreate the index.
+    """Scan the file storage and update the index.
 
     Returns file position, max oid, and last transaction id.  It also
     stores index information in the three dictionary arguments.
@@ -1556,18 +1563,28 @@
     Arguments:
     file -- a file object (the Data.fs)
     name -- the name of the file (presumably file.name)
-    index -- dictionary, oid -> data record
-    vindex -- dictionary, oid -> data record for version data
-    tindex -- dictionary, oid -> data record
-       XXX tindex is cleared before return, so it will be empty
+    index -- fsIndex, oid -> data record file offset
+    vindex -- dictionary, oid -> data record offset for version data
+    tindex -- dictionary, oid -> data record offset
+              tindex is cleared before return
 
     There are several default arguments that affect the scan or the
-    return values.  XXX should document them.
+    return values.  TODO:  document them.
 
+    start -- the file position at which to start scanning for oids added
+             beyond the ones the passed-in indices know about.  The .index
+             file caches the highest ._pos FileStorage knew about when the
+             .index file was last saved, and that's the intended value
+             to pass in for start; accept the default (and pass empty
+             indices) to recreate the index from scratch.
+    maxoid -- ignored (it meant something prior to ZODB 3.2.6; the argument
+              still exists just so the signature of read_index() stayed the
+              same)
+
     The file position returned is the position just after the last
     valid transaction record.  The oid returned is the maximum object
-    id in the data.  The transaction id is the tid of the last
-    transaction.
+    id in `index`, or z64 if the index is empty.  The transaction id is the
+    tid of the last transaction, or ltid if the index is empty.
     """
 
     read = file.read
@@ -1577,14 +1594,15 @@
     fmt = TempFormatter(file)
 
     if file_size:
-        if file_size < start: raise FileStorageFormatError, file.name
+        if file_size < start:
+            raise FileStorageFormatError, file.name
         seek(0)
         if read(4) != packed_version:
             raise FileStorageFormatError, name
     else:
         if not read_only:
             file.write(packed_version)
-        return 4L, maxoid, ltid
+        return 4L, z64, ltid
 
     index_get=index.get
 
@@ -1651,18 +1669,18 @@
         if tid >= stop:
             break
 
-        tpos=pos
-        tend=tpos+tl
+        tpos = pos
+        tend = tpos + tl
 
         if status=='u':
             # Undone transaction, skip it
             seek(tend)
-            h=u64(read(8))
+            h = u64(read(8))
             if h != tl:
                 if recover: return tpos, None, None
                 panic('%s has inconsistent transaction length at %s',
                       name, pos)
-            pos=tend+8
+            pos = tend + 8
             continue
 
         pos = tpos+ TRANS_HDR_LEN + ul + dl + el
@@ -1690,27 +1708,34 @@
                     logger.warning("%s incorrect previous pointer at %s",
                                    name, pos)
 
-            pos=pos+dlen
+            pos += dlen
 
         if pos != tend:
-            if recover: return tpos, None, None
+            if recover:
+                return tpos, None, None
             panic("%s data records don't add up at %s",name,tpos)
 
         # Read the (intentionally redundant) transaction length
         seek(pos)
         h = u64(read(8))
         if h != tl:
-            if recover: return tpos, None, None
+            if recover:
+                return tpos, None, None
             panic("%s redundant transaction length check failed at %s",
                   name, pos)
-        pos=pos+8
+        pos += 8
 
-        if tindex: # avoid the pathological empty transaction case
-            _maxoid = max(tindex.keys()) # in 2.2, just max(tindex)
-            maxoid = max(_maxoid, maxoid)
-            index.update(tindex)
-            tindex.clear()
+        index.update(tindex)
+        tindex.clear()
 
+    # Caution:  fsIndex doesn't have an efficient __nonzero__ or __len__.
+    # That's why we do try/except instead.  fsIndex.maxKey() is fast.
+    try:
+        maxoid = index.maxKey()
+    except ValueError:
+        # The index is empty.
+        maxoid = z64
+
     return pos, maxoid, ltid
 
 

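To summarize the new _restore_index() control flow, a simplified sketch
(not the checked-in code; the real method also sanity-checks the
unpickled position against the data file via _sane()):

    import cPickle as pickle
    from ZODB.fsIndex import fsIndex

    def restore_index_sketch(index_name, is_read_only):
        f = open(index_name, 'rb')
        info = pickle.Unpickler(f).load()
        f.close()
        index = info.get('index')       # the cached 'oid' is ignored now
        if isinstance(index, dict):
            # Old .index files pickled a plain dict; convert in memory.
            newindex = fsIndex()
            newindex.update(index)
            index = newindex
            if not is_read_only:
                # Persist the converted fsIndex so the conversion isn't
                # repeated on the next open; skipped in read-only mode.
                info['index'] = index
                f = open(index_name, 'wb')
                pickle.Pickler(f, 1).dump(info)
                f.close()
        return index
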
Modified: ZODB/branches/3.3/src/ZODB/tests/testFileStorage.py
===================================================================
--- ZODB/branches/3.3/src/ZODB/tests/testFileStorage.py	2005-02-25 20:46:19 UTC (rev 29301)
+++ ZODB/branches/3.3/src/ZODB/tests/testFileStorage.py	2005-02-25 21:51:33 UTC (rev 29302)
@@ -77,67 +77,66 @@
 
         self.assertEqual(self._storage._index.__class__, fsIndex)
 
-    # XXX We could really use some tests for sanity checking
+    # A helper for checking that when a .index file contains a dict for
+    # the index, it's converted to an fsIndex when the file is opened.
+    def convert_index_to_dict(self):
+        # Convert the index in the current .index file to a Python dict.
+        # Return the index originally found.
+        import cPickle as pickle
 
-    def check_conversion_to_fsIndex_not_if_readonly(self):
+        f = open('FileStorageTests.fs.index', 'r+b')
+        p = pickle.Unpickler(f)
+        data = p.load()
+        index = data['index']
 
-        self.tearDown()
+        newindex = dict(index)
+        data['index'] = newindex
 
-        class OldFileStorage(ZODB.FileStorage.FileStorage):
-            def _newIndexes(self):
-                return {}, {}, {}, {}, {}, {}, {}
+        f.seek(0)
+        f.truncate()
+        p = pickle.Pickler(f, 1)
+        p.dump(data)
+        f.close()
+        return index
 
-
+    def check_conversion_to_fsIndex(self, read_only=False):
         from ZODB.fsIndex import fsIndex
 
-        # Hack FileStorage to create dictionary indexes
-        self._storage = OldFileStorage('FileStorageTests.fs')
-
-        self.assertEqual(type(self._storage._index), type({}))
+        # Create some data, and remember the index.
         for i in range(10):
             self._dostore()
+        oldindex_as_dict = dict(self._storage._index)
 
-        # Should save the index
+        # Save the index.
         self._storage.close()
 
-        self._storage = ZODB.FileStorage.FileStorage(
-            'FileStorageTests.fs', read_only=1)
-        self.assertEqual(type(self._storage._index), type({}))
+        # Convert it to a dict.
+        old_index = self.convert_index_to_dict()
+        self.assert_(isinstance(old_index, fsIndex))
+        new_index = self.convert_index_to_dict()
+        self.assert_(isinstance(new_index, dict))
 
-    def check_conversion_to_fsIndex(self):
+        # Verify it's converted to fsIndex in memory upon open.
+        self.open(read_only=read_only)
+        self.assert_(isinstance(self._storage._index, fsIndex))
 
-        self.tearDown()
+        # Verify it has the right content.
+        newindex_as_dict = dict(self._storage._index)
+        self.assertEqual(oldindex_as_dict, newindex_as_dict)
 
-        class OldFileStorage(ZODB.FileStorage.FileStorage):
-            def _newIndexes(self):
-                return {}, {}, {}, {}, {}, {}, {}
-
-
-        from ZODB.fsIndex import fsIndex
-
-        # Hack FileStorage to create dictionary indexes
-        self._storage = OldFileStorage('FileStorageTests.fs')
-
-        self.assertEqual(type(self._storage._index), type({}))
-        for i in range(10):
-            self._dostore()
-
-        oldindex = self._storage._index.copy()
-
-        # Should save the index
+        # Check that the type on disk has changed iff read_only is False.
         self._storage.close()
+        current_index = self.convert_index_to_dict()
+        if read_only:
+            self.assert_(isinstance(current_index, dict))
+        else:
+            self.assert_(isinstance(current_index, fsIndex))
 
-        self._storage = ZODB.FileStorage.FileStorage('FileStorageTests.fs')
-        self.assertEqual(self._storage._index.__class__, fsIndex)
-        self.failUnless(self._storage._used_index)
+    def check_conversion_to_fsIndex_readonly(self):
+        # Same thing, but the disk .index should continue to hold a
+        # Python dict.
+        self.check_conversion_to_fsIndex(read_only=True)
 
-        index = {}
-        for k, v in self._storage._index.items():
-            index[k] = v
-
-        self.assertEqual(index, oldindex)
-
-
     def check_save_after_load_with_no_index(self):
         for i in range(10):
             self._dostore()
@@ -146,7 +145,46 @@
         self.open()
         self.assertEqual(self._storage._saved, 1)
 
+    def check_index_oid_ignored(self):
+        # Prior to ZODB 3.2.6, the 'oid' value stored in the .index file
+        # was believed.  But there were cases where adding larger oids
+        # didn't update the FileStorage ._oid attribute -- the restore()
+        # method in particular didn't update it, and that's about the only
+        # method copyTransactionsFrom() uses.  A database copy created that
+        # way then stored an 'oid' of z64 in the .index file.  This created
+        # torturous problems, as when that file was opened, "new" oids got
+        # generated starting over from 0 again.
+        # Now the cached 'oid' value is ignored:  verify that this is so.
+        import cPickle as pickle
+        from ZODB.utils import z64
+        from ZODB.DB import DB
 
+        # Create some data.
+        db = DB(self._storage)
+        conn = db.open()
+        conn.root()['xyz'] = 1
+        get_transaction().commit()
+        true_max_oid = self._storage._oid
+
+        # Save away the index, and poke in a bad 'oid' value by hand.
+        db.close()
+        f = open('FileStorageTests.fs.index', 'r+b')
+        p = pickle.Unpickler(f)
+        data = p.load()
+        saved_oid = data['oid']
+        self.assertEqual(true_max_oid, saved_oid)
+        data['oid'] = z64
+        f.seek(0)
+        f.truncate()
+        p = pickle.Pickler(f, 1)
+        p.dump(data)
+        f.close()
+
+        # Verify that we get the correct oid again when we reopen, despite
+        # that we stored nonsense in the .index file's 'oid'.
+        self.open()
+        self.assertEqual(self._storage._oid, true_max_oid)
+
     # This would make the unit tests too slow
     # check_save_after_load_that_worked_hard(self)
 


