[Zodb-checkins] CVS: ZODB3/Tools - migrate.py:1.1

Barry Warsaw barry@wooz.org
Tue, 14 Jan 2003 12:23:01 -0500


Update of /cvs-repository/ZODB3/Tools
In directory cvs.zope.org:/tmp/cvs-serv4500

Added Files:
	migrate.py 
Log Message:
A storage migration and information collecting script.  Integrated
with StorageTypes.


=== Added File ZODB3/Tools/migrate.py ===
#! /usr/bin/env python
##############################################################################
#
# Copyright (c) 2001, 2002, 2003 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################

"""A script to gather statistics while doing a storage migration.

This is very similar to a standard storage's copyTransactionsFrom() method,
except that it's geared to run as a script, and it collects useful pieces of
information as it's working.  This script can be used to stress test a storage
since it blasts transactions at it as fast as possible.  You can get a good
sense of the performance of a storage by running this script.

Actually it just counts the size of pickles in the transaction via the
iterator protocol, so storage overheads aren't counted.

Usage: %(PROGRAM)s [options] [source-storage-args] [destination-storage-args]
Options:
    -S sourcetype
    --stype=sourcetype
        This is the name of a recognized type for the source database.  Use -T
        to print out the known types.  Defaults to "FileStorage".

    -D desttype
    --dtype=desttype
        This is the name of the recognized type for the destination database.
        Use -T to print out the known types.  Defaults to "FileStorage".

    -o filename
    --output=filename
        Print results in filename, otherwise stdout.

    -m txncount
    --max=txncount
        Stop after committing txncount transactions.

    -k txncount
    --skip=txncount
        Skip the first txncount transactions.

    -p/--profile
        Turn on specialized profiling.

    -t/--timestamps
        Print tids as timestamps.

    -T/--storage_types
        Print all the recognized storage types and exit.

    -v/--verbose
        Turns on verbose output.  Multiple -v options increase the verbosity.

    -h/--help
        Print this message and exit.

Positional arguments:

    source-storage-args:
        Semicolon separated list of arguments for the source storage, as
        key=val pairs.  E.g. "file_name=Data.fs;read_only=1"

    destination-storage-args:
        Semicolon separated list of arguments for the destination storage, as
        key=val pairs.  E.g. "name=full;frequency=3600"
"""

import os
import re
import sys
import time
import errno
import getopt
import marshal
import profile
import traceback

import ZODB
from ZODB import utils
from ZODB import StorageTypes
from ZODB.TimeStamp import TimeStamp

# Name this script was invoked as; interpolated into the usage text.
PROGRAM = sys.argv[0]
# Eight NUL bytes: passed to store() as the previous revision id of an
# object that has not yet been stored during this run (see doit()).
ZERO = '\0'*8

# Define True and False on interpreters that do not provide them as
# builtins.
try:
    True, False
except NameError:
    True = 1
    False = 0



def usage(code, msg=''):
    """Write the module help text to stderr and terminate the process.

    The module docstring is interpolated against the module globals (it
    refers to PROGRAM) before being written.

    Arguments:
        code -- exit status handed to sys.exit().
        msg  -- optional extra message; when non-empty it is written on its
                own line after the help text.

    Raises SystemExit; never returns.
    """
    stream = sys.stderr
    print >> stream, __doc__ % globals()
    if msg:
        print >> stream, msg
    sys.exit(code)


def error(code, msg):
    """Report a fatal command-line error on stderr and terminate.

    Arguments:
        code -- exit status handed to sys.exit().
        msg  -- the error text.  It is formatted with %s, so an exception
                instance (e.g. the one raised by getopt) is accepted too.

    Raises SystemExit; never returns.
    """
    # Both the error and the hint belong on stderr.  stdout is where the
    # statistics report goes by default, so diagnostics must not leak
    # into it.
    sys.stderr.write('%s\n' % (msg,))
    sys.stderr.write('use --help for usage message\n')
    sys.exit(code)



def main():
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'hvo:pm:k:D:S:Tt',
            ['help', 'verbose',
             'output=', 'profile', 'storage_types',
             'max=', 'skip=', 'dtype=', 'stype=', 'timestamps'])
    except getopt.error, msg:
        error(2, msg)

    class Options:
        stype = 'FileStorage'
        dtype = 'FileStorage'
        verbose = 0
        outfile = None
        profilep = False
        maxtxn = -1
        skiptxn = -1
        timestamps = False

    options = Options()

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-v', '--verbose'):
            options.verbose += 1
        elif opt in ('-T', '--storage_types'):
            print_types()
            sys.exit(0)
        elif opt in ('-S', '--stype'):
            options.stype = arg
        elif opt in ('-D', '--dtype'):
            options.dtype = arg
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--profile'):
            options.profilep = True
        elif opt in ('-m', '--max'):
            options.maxtxn = int(arg)
        elif opt in ('-k', '--skip'):
            options.skiptxn = int(arg)
        elif opt in ('-t', '--timestamps'):
            options.timestamps = True

    if len(args) > 2:
        error(2, "too many arguments")

    srckws = {}
    if len(args) > 0:
        srcargs = args[0]
        for kv in re.split(r';\s*', srcargs):
            key, val = kv.split('=')
            srckws[key] = val

    destkws = {}
    if len(args) > 1:
        destargs = args[1]
        for kv in re.split(r';\s*', destargs):
            key, val = kv.split('=')
            destkws[key] = val

    if options.stype not in StorageTypes.storage_types.keys():
        usage(2, 'Source database type must be provided')
    if options.dtype not in StorageTypes.storage_types.keys():
        usage(2, 'Destination database type must be provided')

    # Open the output file
    if options.outfile is None:
        options.outfp = sys.stdout
        options.outclosep = False
    else:
        options.outfp = open(options.outfile, 'w')
        options.outclosep = True

    if options.verbose > 0:
        print 'Opening source database...'
    modname, sconv = StorageTypes.storage_types[options.stype]
    kw = sconv(**srckws)
    __import__(modname)
    sclass = getattr(sys.modules[modname], options.stype)
    srcdb = sclass(**kw)

    if options.verbose > 0:
        print 'Opening destination database...'
    modname, dconv = StorageTypes.storage_types[options.dtype]
    kw = dconv(**destkws)
    __import__(modname)
    dclass = getattr(sys.modules[modname], options.dtype)
    dstdb = dclass(**kw)

    try:
        t0 = time.time()
        doit(srcdb, dstdb, options)
        t1 = time.time()
        if options.verbose > 0:
            print 'Migration time:          %8.3f' % (t1-t0)
    finally:
        # Done
        srcdb.close()
        dstdb.close()
        if options.outclosep:
            options.outfp.close()



def doit(srcdb, dstdb, options):
    outfp = options.outfp
    profilep = options.profilep
    verbose = options.verbose
    # some global information
    largest_pickle = 0
    largest_txn_in_size = 0
    largest_txn_in_objects = 0
    total_pickle_size = 0L
    total_object_count = 0
    # Ripped from BaseStorage.copyTransactionsFrom()
    ts = None
    ok = True
    prevrevids = {}
    counter = 0
    skipper = 0
    if options.timestamps:
        print "%4s. %26s %6s %8s %5s %5s %5s %5s %5s" % (
            "NUM", "TID AS TIMESTAMP", "OBJS", "BYTES",
            # Does anybody know what these times mean?
            "t4-t0", "t1-t0", "t2-t1", "t3-t2", "t4-t3")
    else:
        print "%4s. %20s %6s %8s %6s %6s %6s %6s %6s" % (
            "NUM", "TRANSACTION ID", "OBJS", "BYTES",
            # Does anybody know what these times mean?
            "t4-t0", "t1-t0", "t2-t1", "t3-t2", "t4-t3")
    for txn in srcdb.iterator():
        skipper += 1
        if skipper <= options.skiptxn:
            continue
        counter += 1
        if counter > options.maxtxn >= 0:
            break
        tid = txn.tid
        if ts is None:
            ts = TimeStamp(tid)
        else:
            t = TimeStamp(tid)
            if t <= ts:
                if ok:
                    print >> sys.stderr, \
                          'Time stamps are out of order %s, %s' % (ts, t)
                    ok = False
                    ts = t.laterThan(ts)
                    tid = `ts`
                else:
                    ts = t
                    if not ok:
                        print >> sys.stderr, \
                              'Time stamps are back in order %s' % t
                        ok = True
        if verbose > 1:
            print ts

        prof = None
        if profilep and (counter % 100) == 0:
            prof = profile.Profile()
        objects = 0
        size = 0
        newrevids = RevidAccumulator()
        t0 = time.time()
        dstdb.tpc_begin(txn, tid, txn.status)
        t1 = time.time()
        for r in txn:
            oid = r.oid
            objects += 1
            thissize = len(r.data)
            size += thissize
            if thissize > largest_pickle:
                largest_pickle = thissize
            if verbose > 1:
                if not r.version:
                    vstr = 'norev'
                else:
                    vstr = r.version
                print utils.U64(oid), vstr, len(r.data)
            oldrevid = prevrevids.get(oid, ZERO)
            result = dstdb.store(oid, oldrevid, r.data, r.version, txn)
            newrevids.store(oid, result)
        t2 = time.time()
        result = dstdb.tpc_vote(txn)
        t3 = time.time()
        newrevids.tpc_vote(result)
        prevrevids.update(newrevids.get_dict())
        # Profile every 100 transactions
        if prof:
            prof.runcall(dstdb.tpc_finish, txn)
        else:
            dstdb.tpc_finish(txn)
        t4 = time.time()

        # record the results
        if objects > largest_txn_in_objects:
            largest_txn_in_objects = objects
        if size > largest_txn_in_size:
            largest_txn_in_size = size
        if options.timestamps:
            tidstr = str(TimeStamp(tid))
            format = "%4d. %26s %6d %8d %5.3f %5.3f %5.3f %5.3f %5.3f"
        else:
            tidstr = utils.U64(tid)
            format = "%4d. %20s %6d %8d %6.4f %6.4f %6.4f %6.4f %6.4f"
        print >> outfp, format % (skipper, tidstr, objects, size,
                                  t4-t0, t1-t0, t2-t1, t3-t2, t4-t3)
        total_pickle_size += size
        total_object_count += objects

        if prof:
            prof.create_stats()
            fp = open('profile-%02d.txt' % (counter / 100), 'wb')
            marshal.dump(prof.stats, fp)
            fp.close()
    print >> outfp, "Largest pickle:          %8d" % largest_pickle
    print >> outfp, "Largest transaction:     %8d" % largest_txn_in_size
    print >> outfp, "Largest object count:    %8d" % largest_txn_in_objects
    print >> outfp, "Total pickle size: %14d" % total_pickle_size
    print >> outfp, "Total object count:      %8d" % total_object_count



# helper to deal with differences between old-style store() return and
# new-style store() return that supports ZEO
import types

class RevidAccumulator:
    """Collect the oid -> serial mapping produced by one transaction.

    A storage may report new serials in two ways, and both are accepted:
    store() may return the serial for the oid just stored as a string, or
    store() and tpc_vote() may return a sequence of (oid, serial) pairs.
    A None result carries no information and is ignored.
    """

    def __init__(self):
        # Maps oid -> most recently reported serial.
        self.data = {}

    def _update_from_list(self, pairs):
        """Record every (oid, serial) pair from pairs.

        A serial that is not a string is raised instead of recorded --
        presumably it is an exception object the storage delivered in place
        of a serial (TODO confirm against the ZEO store() protocol).
        """
        for oid, serial in pairs:
            if not isinstance(serial, str):
                raise serial
            self.data[oid] = serial

    def store(self, oid, result):
        """Record the result of a store() call made for oid."""
        if isinstance(result, str):
            self.data[oid] = result
        elif result is not None:
            self._update_from_list(result)

    def tpc_vote(self, result):
        """Record the result of a tpc_vote() call, if it returned one."""
        if result is not None:
            self._update_from_list(result)

    def get_dict(self):
        """Return the accumulated oid -> serial dictionary (not a copy)."""
        return self.data



# Run the migration only when executed as a script, not when imported.
if __name__ == '__main__':
    main()