[Zodb-checkins] CVS: ZODB3/Tools - repozo.py:1.1.2.2
Barry Warsaw
barry@wooz.org
Tue, 11 Feb 2003 13:26:52 -0500
Update of /cvs-repository/ZODB3/Tools
In directory cvs.zope.org:/tmp/cvs-serv3849
Modified Files:
Tag: ZODB3-3_1-branch
repozo.py
Log Message:
Port --quick and --gzip flags from 3.2 head.
=== ZODB3/Tools/repozo.py 1.1.2.1 => 1.1.2.2 ===
--- ZODB3/Tools/repozo.py:1.1.2.1 Fri Jan 31 10:52:41 2003
+++ ZODB3/Tools/repozo.py Tue Feb 11 13:26:52 2003
@@ -26,7 +26,6 @@
-h / --help
Print this text and exit
-Flags for --backup and --recover:
-r dir
--repository=dir
Repository directory containing the backup files
@@ -39,6 +38,15 @@
-F / --full
Force a full backup
+ -Q / --quick
+ Verify via md5 checksum only the last incremental written. This
+ significantly reduces the disk i/o at the (theoretical) cost of
+ inconsistency.
+
+ -z / --gzip
+ Compress with gzip the backup files. Uses the default zlib
+ compression level.
+
Flags for --recover:
-D str
--date=str
@@ -58,7 +66,9 @@
import os
import sys
import md5
+import gzip
import time
+import errno
import getopt
from ZODB.FileStorage import FileStorage
@@ -102,10 +112,10 @@
def parseargs():
global VERBOSE
try:
- opts, args = getopt.getopt(sys.argv[1:], 'BRvhf:r:FD:o:',
+ opts, args = getopt.getopt(sys.argv[1:], 'BRvhf:r:FD:o:Qz',
['backup', 'recover', 'verbose', 'help',
'file=', 'repository=', 'full', 'date=',
- 'output='])
+ 'output=', 'quick', 'gzip'])
except getopt.error, msg:
usage(1, msg)
@@ -116,12 +126,16 @@
full = False
date = None
output = None
+ quick = False
+ gzip = False
options = Options()
for opt, arg in opts:
if opt in ('-h', '--help'):
usage(0)
+ elif opt in ('-v', '--verbose'):
+ VERBOSE = True
elif opt in ('-R', '--recover'):
if options.mode is not None:
usage(1, '-B and -R are mutually exclusive')
@@ -130,8 +144,8 @@
if options.mode is not None:
usage(1, '-B and -R are mutually exclusive')
options.mode = BACKUP
- elif opt in ('-v', '--verbose'):
- VERBOSE = True
+ elif opt in ('-Q', '--quick'):
+ options.quick = True
elif opt in ('-f', '--file'):
options.file = arg
elif opt in ('-r', '--repository'):
@@ -142,6 +156,8 @@
options.date = arg
elif opt in ('-o', '--output'):
options.output = arg
+ elif opt in ('-z', '--gzip'):
+ options.gzip = True
# Any other arguments are invalid
if args:
@@ -169,12 +185,12 @@
# Do something with a run of bytes from a file
-def dofile(func, fp, n):
+def dofile(func, fp, n=None):
bytesread = 0
stop = False
chunklen = READCHUNK
while not stop:
- if chunklen + bytesread > n:
+ if n is not None and chunklen + bytesread > n:
chunklen = n - bytesread
stop = True
data = fp.read(chunklen)
@@ -185,27 +201,32 @@
return bytesread
-def checksum(filename, n):
+def checksum(fp, n):
# Checksum the first n bytes of the specified file
sum = md5.new()
- fp = open(filename, 'rb')
def func(data):
sum.update(data)
dofile(func, fp, n)
return sum.hexdigest()
-def copyfile(src, dst, start, n):
+def copyfile(options, dst, start, n):
# Copy bytes from file src, to file dst, starting at offset start, for n
# length of bytes
- ifp = open(src, 'rb')
+ sum = md5.new()
+ ifp = open(options.file, 'rb')
ifp.seek(start)
- ofp = open(dst, 'wb')
+ if options.gzip:
+ ofp = gzip.open(dst, 'wb')
+ else:
+ ofp = open(dst, 'wb')
def func(data):
+ sum.update(data)
ofp.write(data)
dofile(func, ifp, n)
ofp.close()
ifp.close()
+ return sum.hexdigest()
def concat(files, ofp=None):
@@ -219,8 +240,12 @@
ofp.write(data)
bytesread = 0
for f in files:
- ifp = open(f, 'rb')
- bytesread += dofile(func, ifp, os.path.getsize(f))
+ # Auto uncompress
+ if f.endswith('fsz'):
+ ifp = gzip.open(f, 'rb')
+ else:
+ ifp = open(f, 'rb')
+ bytesread += dofile(func, ifp)
ifp.close()
if ofp:
ofp.close()
@@ -233,6 +258,8 @@
ext = '.fs'
else:
ext = '.deltafs'
+ if options.gzip:
+ ext += 'z'
t = time.gmtime()[:6] + (ext,)
return '%04d-%02d-%02d-%02d-%02d-%02d%s' % t
@@ -255,7 +282,7 @@
root, ext = os.path.splitext(file)
if root <= when:
needed.append(file)
- if ext == '.fs':
+ if ext in ('.fs', '.fsz'):
break
# Make the file names relative to the repository directory
needed = [os.path.join(options.repository, f) for f in needed]
@@ -270,6 +297,32 @@
return needed
+def scandat(repofiles):
+ # Scan the .dat file corresponding to the last full backup performed.
+ # Return the filename, startpos, endpos, and sum of the last incremental.
+ # If all is a list, then append file name and md5sums to the list.
+ fullfile = repofiles[0]
+ datfile = os.path.splitext(fullfile)[0] + '.dat'
+ # If the .dat file is missing, we have to do a full backup
+ fn = startpos = endpos = sum = None
+ try:
+ fp = open(datfile)
+ except IOError, e:
+ if e.errno <> errno.ENOENT:
+ raise
+ else:
+ while True:
+ line = fp.readline()
+ if not line:
+ break
+ # We only care about the last one
+ fn, startpos, endpos, sum = line.split()
+ fp.close()
+ startpos = long(startpos)
+ endpos = long(endpos)
+ return fn, startpos, endpos, sum
+
+
def do_full_backup(options):
# Find the file position of the last completed transaction.
@@ -286,10 +339,16 @@
if os.path.exists(dest):
print >> sys.stderr, 'Cannot overwrite existing file:', dest
sys.exit(2)
- copyfile(options.file, dest, 0, pos)
+ log('writing full backup: %s bytes to %s', pos, dest)
+ sum = copyfile(options, dest, 0, pos)
+ # Write the data file for this full backup
+ datfile = os.path.splitext(dest)[0] + '.dat'
+ fp = open(datfile, 'w')
+ print >> fp, dest, 0, pos, sum
+ fp.close()
-def do_incremental_backup(options, dstfile, reposz):
+def do_incremental_backup(options, reposz, repofiles):
# Find the file position of the last completed transaction.
fs = FileStorage(options.file, read_only=True)
# Note that the FileStorage ctor calls read_index() which scans the file
@@ -305,7 +364,16 @@
print >> sys.stderr, 'Cannot overwrite existing file:', dest
sys.exit(2)
log('writing incremental: %s bytes to %s', pos-reposz, dest)
- copyfile(options.file, dest, reposz, pos)
+ sum = copyfile(options, dest, reposz, pos)
+ # The first file in repofiles points to the last full backup. Use this to
+ # get the .dat file and append the information for this incremental to
+ # that file.
+ fullfile = repofiles[0]
+ datfile = os.path.splitext(fullfile)[0] + '.dat'
+ # This .dat file better exist. Let the exception percolate if not.
+ fp = open(datfile, 'a')
+ print >> fp, dest, reposz, pos, sum
+ fp.close()
def do_backup(options):
@@ -315,39 +383,72 @@
log('doing a full backup')
do_full_backup(options)
return
- # See if we can do an incremental, based on the files that already exist.
- # This call of concat() will not write an output file.
- reposz, reposum = concat(repofiles)
- log('repository state: %s bytes, md5: %s', reposz, reposum)
srcsz = os.path.getsize(options.file)
- # Get the md5 checksum of the source file, up to two file positions: the
- # entire size of the file, and up to the file position of the last
- # incremental backup.
- srcsum = checksum(options.file, srcsz)
- srcsum_backedup = checksum(options.file, reposz)
- log('current state : %s bytes, md5: %s', srcsz, srcsum)
- log('backed up state : %s bytes, md5: %s', reposz, srcsum_backedup)
- # Has nothing changed?
- if srcsz == reposz and srcsum == reposum:
- log('No changes, nothing to do')
- return
- # Has the file shrunk (probably because of a pack)?
- if srcsz < reposz:
- log('file shrunk, possibly because of a pack (full backup)')
- do_full_backup(options)
- return
- # The source file is larger than the repository. If the md5 checksums
- # match, then we know we can do an incremental backup. If they don't,
- # then perhaps the file was packed at some point (or a non-transactional
- # undo was performed, but this is deprecated). Only do a full backup if
- # forced to.
- #
- # XXX For ZODB4, this needs to take into account the storage metadata
- # header that FileStorage has grown at the front of the file.
- if reposum == srcsum_backedup:
- incrdest = gen_filename(options)
- do_incremental_backup(options, incrdest, reposz)
- return
+ if options.quick:
+ fn, startpos, endpos, sum = scandat(repofiles)
+ # If the .dat file was missing, or was empty, do a full backup
+ if (fn, startpos, endpos, sum) == (None, None, None, None):
+ log('missing or empty .dat file (full backup)')
+ do_full_backup(options)
+ return
+ # Has the file shrunk, possibly because of a pack?
+ if srcsz < endpos:
+ log('file shrunk, possibly because of a pack (full backup)')
+ do_full_backup(options)
+ return
+ # Now check the md5 sum of the source file, from the last
+ # incremental's start and stop positions.
+ srcfp = open(options.file)
+ srcfp.seek(startpos)
+ srcsum = checksum(srcfp, endpos-startpos)
+ log('last incremental file: %s', fn)
+ log('last incremental checksum: %s', sum)
+ log('source checksum range: [%s..%s], sum: %s',
+ startpos, endpos, srcsum)
+ if sum == srcsum:
+ log('doing incremental, starting at: %s', endpos)
+ do_incremental_backup(options, endpos, repofiles)
+ return
+ else:
+ # This way is much slower, and more disk i/o intensive, but it's also
+ # more accurate since it checks the actual existing files instead of
+ # the information in the .dat file.
+ #
+ # See if we can do an incremental, based on the files that already
+ # exist. This call of concat() will not write an output file.
+ reposz, reposum = concat(repofiles)
+ log('repository state: %s bytes, md5: %s', reposz, reposum)
+ # Get the md5 checksum of the source file, up to two file positions:
+ # the entire size of the file, and up to the file position of the last
+ # incremental backup.
+ srcfp = open(options.file)
+ srcsum = checksum(srcfp, srcsz)
+ srcfp.seek(0)
+ srcsum_backedup = checksum(srcfp, reposz)
+ srcfp.close()
+ log('current state : %s bytes, md5: %s', srcsz, srcsum)
+ log('backed up state : %s bytes, md5: %s', reposz, srcsum_backedup)
+ # Has nothing changed?
+ if srcsz == reposz and srcsum == reposum:
+ log('No changes, nothing to do')
+ return
+ # Has the file shrunk, probably because of a pack?
+ if srcsz < reposz:
+ log('file shrunk, possibly because of a pack (full backup)')
+ do_full_backup(options)
+ return
+ # The source file is larger than the repository. If the md5 checksums
+ # match, then we know we can do an incremental backup. If they don't,
+ # then perhaps the file was packed at some point (or a
+ # non-transactional undo was performed, but this is deprecated). Only
+ # do a full backup if forced to.
+ #
+ # XXX For ZODB4, this needs to take into account the storage metadata
+ # header that FileStorage has grown at the front of the file.
+ if reposum == srcsum_backedup:
+ log('doing incremental, starting at: %s', reposz)
+ do_incremental_backup(options, reposz, repofiles)
+ return
# The checksums don't match, meaning the front of the source file has
# changed. We'll need to do a full backup in that case.
log('file changed, possibly because of a pack (full backup)')