[Zope-Checkins] CVS: ZODB3/zdaemon - zdaemon.py:1.3

Guido van Rossum guido@python.org
Fri, 8 Nov 2002 16:51:23 -0500


Update of /cvs-repository/ZODB3/zdaemon
In directory cvs.zope.org:/tmp/cvs-serv25088

Modified Files:
	zdaemon.py 
Log Message:
Implement backoff.  By default, after 10 rapid restarts, we exit; use
-f to keep trying forever.


=== ZODB3/zdaemon/zdaemon.py 1.2 => 1.3 ===
--- ZODB3/zdaemon/zdaemon.py:1.2	Fri Nov  8 14:16:30 2002
+++ ZODB3/zdaemon/zdaemon.py	Fri Nov  8 16:51:23 2002
@@ -6,10 +6,10 @@
 Usage: python zdaemon.py [zdaemon-options] program [program-arguments]
 
 Options:
+  -b SECONDS -- set backoff limit to SECONDS (default 10; see below)
   -d -- run as a proper daemon; fork a background process, close files etc.
+  -f -- run forever (by default, exit when the backoff limit is exceeded)
   -h -- print usage message and exit
-
-Arguments:
   program [program-arguments] -- an arbitrary application to run
 
 This daemon manager has two purposes: it restarts the application when
@@ -21,15 +21,24 @@
 it is not restarted.  Any other form of termination (either being
 killed by a signal or exiting with an exit status other than 2) causes
 it to be restarted.
+
+Backoff limit: when the application exits (nearly) immediately after a
+restart, the daemon manager starts slowing down by delaying between
+restarts.  The delay starts at 1 second and grows by one second on
+each restart, up to the backoff limit given by the -b option; it is
+reset once the application runs for longer than the backoff limit
+(in seconds).  By default, when the delay reaches the backoff limit, the
+daemon manager exits (under the assumption that the application has a
+persistent fault).  The -f (forever) option prevents this exit; use it
+when you expect that a temporary external problem (such as a network
+outage or an overfull disk) may prevent the application from starting
+but you want the daemon manager to keep trying.
+
 """
 
 """
 XXX TO DO
 
-- A parametrizable "governor" on the automatic restart, limiting the
-  frequency of restarts and possible stopping altogether if the
-  application fails too often
-
 - A way to stop both the daemon manager and the application.
 
 - A way to restart the application.
@@ -40,8 +49,9 @@
 """
 
 import os
-assert os.name == "posix" # This code makes Unix-specific assumptions
+assert os.name == "posix" # This code has many Unix-specific assumptions
 import sys
+import time
 import getopt
 import signal
 from stat import ST_MODE
@@ -50,10 +60,14 @@
 
 class Daemonizer:
 
+    # Settable options
+    daemon = 0
+    forever = 0
+    backofflimit = 10
+
     def __init__(self):
         self.filename = None
         self.args = []
-        self.daemon = 0
 
     def main(self, args=None):
         self.prepare(args)
@@ -64,7 +78,7 @@
             args = sys.argv[1:]
         self.blather("args=%s" % repr(args))
         try:
-            opts, args = getopt.getopt(args, "dh")
+            opts, args = getopt.getopt(args, "b:dfh")
         except getopt.error, msg:
             self.usage(str(msg))
         self.parseoptions(opts)
@@ -73,11 +87,18 @@
     def parseoptions(self, opts):
         self.info("opts=%s" % repr(opts))
         for o, a in opts:
+            if o == "-b":
+                try:
+                    self.backofflimit = float(a)
+                except:
+                    self.usage("invalid number: %s" % repr(a))
+            if o == "-d":
+                self.daemon += 1
+            if o == "-f":
+                self.forever += 1
             if o == "-h":
                 print __doc__,
                 self.exit()
-            if o == "-d":
-                self.daemon += 1
 
     def setprogram(self, args):
         if not args:
@@ -133,7 +154,7 @@
 
     def sigexit(self, sig, frame):
         self.info("daemon manager killed by signal %s(%d)" %
-                  (self.signalname(sig), sig))
+                  (self.signame(sig), sig))
         self.exit(1)
 
     def daemonize(self):
@@ -155,8 +176,32 @@
     def runforever(self):
         self.info("daemon manager started")
         while 1:
+            self.governor()
             self.forkandexec()
 
+    backoff = 0
+    lasttime = None
+
+    def governor(self):
+        # Back off if respawning too often
+        if not self.lasttime:
+            pass
+        elif time.time() - self.lasttime < self.backofflimit:
+            # Exited rather quickly; slow down the restarts
+            self.backoff += 1
+            if self.backoff >= self.backofflimit:
+                if self.forever:
+                    self.backoff = self.backofflimit
+                else:
+                    self.problem("restarting too often; quit")
+                    self.exit(1)
+            self.info("sleep %s to avoid rapid restarts" % self.backoff)
+            time.sleep(self.backoff)
+        else:
+            # Reset the backoff timer
+            self.backoff = 0
+        self.lasttime = time.time()
+
     def forkandexec(self):
         pid = os.fork()
         if pid != 0:
@@ -185,13 +230,14 @@
             msg = "pid %d: exit status %s" % (pid, es)
             if es == 0:
                 self.info(msg)
+            elif es == 2:
+                self.problem(msg)
+                self.exit(es)
             else:
                 self.warning(msg)
-                if es == 2:
-                    self.exit(es)
         elif os.WIFSIGNALED(sts):
             signum = os.WTERMSIG(sts)
-            signame = self.signalname(signum)
+            signame = self.signame(signum)
             msg = ("pid %d: terminated by signal %s(%s)" %
                    (pid, signame, signum))
             if hasattr(os, "WCOREDUMP"):
@@ -202,15 +248,12 @@
                 msg += " (core dumped)"
             self.warning(msg)
         else:
-            # XXX what should we do here?
-            signum = os.WSTOPSIG(sts)
-            signame = self.signalname(signum)
-            msg = "pid %d: stopped by signal %s(%s)" % (pid, signame, signum)
+            msg = "pid %d: unknown termination cause 0x%04x" % (pid, sts)
             self.warning(msg)
 
     signames = None
 
-    def signalname(self, sig):
+    def signame(self, sig):
         """Return the symbolic name for signal sig.
 
         Returns 'unknown' if there is no SIG name bound to sig in the
@@ -233,8 +276,8 @@
     # Error handling
 
     def usage(self, msg):
+        self.problem(str(msg))
         self.errwrite("Error: %s\n" % str(msg))
-        self.error(str(msg))
         self.errwrite("For help, use zdaemon.py -h\n")
         self.exit(2)
 
@@ -261,7 +304,7 @@
     def warning(self, msg):
         self.log(msg, zLOG.WARNING)
 
-    def error(self, msg):
+    def problem(self, msg):
         self.log(msg, zLOG.ERROR)
 
     def panic(self, msg):