Commit ab191165 authored by Leif Walsh's avatar Leif Walsh Committed by Yoni Fogel

better logging, exception handling for stress test runner

git-svn-id: file:///svn/toku/tokudb@42103 c7de825b-a66e-492c-adef-691d508d4ae1
parent 178ac5b7
...@@ -14,22 +14,23 @@ By default, we stop everything, update from svn, rebuild, and restart the ...@@ -14,22 +14,23 @@ By default, we stop everything, update from svn, rebuild, and restart the
tests once a day. tests once a day.
""" """
import logging
import os
import sys
import time
from glob import glob from glob import glob
from logging import debug, info, warning, error, exception
from optparse import OptionParser from optparse import OptionParser
import os
from Queue import Queue from Queue import Queue
from random import randrange from random import randrange
from resource import setrlimit, RLIMIT_CORE from resource import setrlimit, RLIMIT_CORE
from shutil import copy, copytree, move, rmtree from shutil import copy, copytree, move, rmtree
from signal import signal, SIGHUP, SIGINT, SIGPIPE, SIGALRM, SIGTERM from signal import signal, SIGHUP, SIGINT, SIGPIPE, SIGALRM, SIGTERM
from subprocess import call, Popen, PIPE, STDOUT from subprocess import call, Popen, PIPE, STDOUT
import sys
from tempfile import mkdtemp, mkstemp from tempfile import mkdtemp, mkstemp
import time from thread import get_ident
import thread
from threading import Event, Thread, Timer from threading import Event, Thread, Timer
import threading
import traceback
__version__ = '$Id$' __version__ = '$Id$'
__copyright__ = """Copyright (c) 2007-2012 Tokutek Inc. All rights reserved. __copyright__ = """Copyright (c) 2007-2012 Tokutek Inc. All rights reserved.
...@@ -48,12 +49,6 @@ recover_testnames = ['recover-test_stress1.tdb', ...@@ -48,12 +49,6 @@ recover_testnames = ['recover-test_stress1.tdb',
'recover-test_stress2.tdb', 'recover-test_stress2.tdb',
'recover-test_stress3.tdb'] 'recover-test_stress3.tdb']
verbose = False
debug = False
def dbg(a):
if debug:
print >>sys.stderr, a
def setlimits(): def setlimits():
setrlimit(RLIMIT_CORE, (-1, -1)) setrlimit(RLIMIT_CORE, (-1, -1))
os.nice(7) os.nice(7)
...@@ -74,12 +69,18 @@ class TestRunnerBase(object): ...@@ -74,12 +69,18 @@ class TestRunnerBase(object):
self.csize = csize self.csize = csize
self.test_time = test_time self.test_time = test_time
self.savedir = savedir self.savedir = savedir
self.logfile = os.path.join(log, ('%s-%d-%d' %
(self.execf, self.tsize, self.csize)))
self.nruns = 0
self.env = os.environ self.env = os.environ
self.env['LD_LIBRARY_PATH'] = (os.path.join(self.tokudb, 'lib') + self.env['LD_LIBRARY_PATH'] = '%s:%s' % (os.path.join(self.tokudb, 'lib'),
':' + self.env['LD_LIBRARY_PATH']) self.env['LD_LIBRARY_PATH'])
loggername = '%s-%d-%d' % (self.execf, self.tsize, self.csize)
self.logger = logging.getLogger(loggername)
self.logger.propagate = False
self.logger.setLevel(logging.INFO)
logfile = os.path.join(log, loggername)
self.logger.addHandler(logging.FileHandler(logfile))
self.nruns = 0
self.rundir = None self.rundir = None
self.tmplog = None self.tmplog = None
self.tmplogname = None self.tmplogname = None
...@@ -106,7 +107,7 @@ class TestRunnerBase(object): ...@@ -106,7 +107,7 @@ class TestRunnerBase(object):
self.envdir = ('../%s-%d-%d-%d-%d-%d.dir' % self.envdir = ('../%s-%d-%d-%d-%d-%d.dir' %
(self.execf, self.tsize, self.csize, (self.execf, self.tsize, self.csize,
self.ptquery, self.update, thread.get_ident())) self.ptquery, self.update, get_ident()))
os.mkdir(self.envdir) os.mkdir(self.envdir)
try: try:
...@@ -121,6 +122,7 @@ class TestRunnerBase(object): ...@@ -121,6 +122,7 @@ class TestRunnerBase(object):
except TestFailure: except TestFailure:
savedir = self.save() savedir = self.save()
self.print_failure(savedir) self.print_failure(savedir)
warning('Saved environment to %s', savedir)
else: else:
self.print_success() self.print_success()
finally: finally:
...@@ -138,13 +140,9 @@ class TestRunnerBase(object): ...@@ -138,13 +140,9 @@ class TestRunnerBase(object):
self.nruns += 1 self.nruns += 1
def save(self): def save(self):
savepfx = ("%s-%s-%d-%d-%d-%d-%s-" % (self.execf, savepfx = ('%s-%s-%d-%d-%d-%d-%s-' %
self.rev, (self.execf, self.rev, self.tsize, self.csize,
self.tsize, self.ptquery, self.update, self.phase))
self.csize,
self.ptquery,
self.update,
self.phase))
savedir = mkdtemp(dir=self.savedir, prefix=savepfx) savedir = mkdtemp(dir=self.savedir, prefix=savepfx)
def targetfor(path): def targetfor(path):
return os.path.join(savedir, os.path.basename(path)) return os.path.join(savedir, os.path.basename(path))
...@@ -160,49 +158,42 @@ class TestRunnerBase(object): ...@@ -160,49 +158,42 @@ class TestRunnerBase(object):
copy(lib, targetfor(lib)) copy(lib, targetfor(lib))
return savedir return savedir
def timestr(self, phase):
timeval = self.times[phase]
if timeval == 0:
return ''
else:
return time.ctime(timeval)
def logstr(self, passed):
if passed:
result = 'PASS'
else:
result = 'FAIL'
return ('"%s",%s,%d,%d,%d,%d,%s,%s,%s,%s' %
(self.execf, self.rev, self.tsize, self.csize,
self.ptquery, self.update,
self.timestr(0), self.timestr(1), self.timestr(2),
result))
def infostr(self, passed):
if passed:
result = 'PASSED'
else:
result = 'FAILED'
return ('[PASS=%d FAIL=%d] PASSED %s tsize=%d csize=%d ptquery=%d update=%d' %
(self.scheduler.passed, self.scheduler.failed,
self.execf, self.tsize, self.csize, self.ptquery, self.update))
def print_success(self): def print_success(self):
f = open(self.logfile, 'w+')
print >>f, ('"%s",%s,%d,%d,%d,%d,%s,%s,%s,PASS' %
(self.execf,
self.rev,
self.tsize,
self.csize,
self.ptquery,
self.update,
time.ctime(self.times[0]),
time.ctime(self.times[1]),
time.ctime(self.times[2])))
f.close()
self.scheduler.passed += 1 self.scheduler.passed += 1
if verbose: self.logger.info(self.logstr(True))
print >>sys.stderr, ('[PASS=%d FAIL=%d] ' info(self.infostr(True))
'PASSED %s tsize=%d csize=%d '
'ptquery=%d update=%d' % def print_failure(self):
(self.scheduler.passed, self.scheduler.failed,
self.execf, self.tsize, self.csize,
self.ptquery, self.update))
def print_failure(self, savedir):
f = open(self.logfile, 'w+')
print >>f, ('"%s",%s,%d,%d,%d,%d,%s,%s,%s,PASS' %
(self.execf,
self.rev,
self.tsize,
self.csize,
self.ptquery,
self.update,
time.ctime(self.times[0]),
time.ctime(self.times[1]),
time.ctime(self.times[2])))
f.close()
self.scheduler.failed += 1 self.scheduler.failed += 1
print >>sys.stderr, ('[PASS=%d FAIL=%d] ' self.logger.warning(self.logstr(False))
'FAILED %s tsize=%d csize=%d ' warning(self.infostr(False))
'ptquery=%d update=%d' %
(self.scheduler.passed, self.scheduler.failed,
self.execf, self.tsize, self.csize,
self.ptquery, self.update))
print >>sys.stderr, 'Saved environment to %s' % savedir
def waitfor(self, proc): def waitfor(self, proc):
while proc.poll() is None: while proc.poll() is None:
...@@ -262,26 +253,24 @@ class Worker(Thread): ...@@ -262,26 +253,24 @@ class Worker(Thread):
self.scheduler = scheduler self.scheduler = scheduler
def run(self): def run(self):
dbg('%s starting.' % self) debug('%s starting.' % self)
while not self.scheduler.stopping.isSet(): while not self.scheduler.stopping.isSet():
test_runner = self.scheduler.get() test_runner = self.scheduler.get()
try: try:
test_runner.run() test_runner.run()
except Exception, e: except Exception, e:
print >>sys.stderr, 'Fatal error in worker thread.' exception('Fatal error in worker thread.')
traceback.print_exc() info('Killing all workers.')
print >>sys.stderr, 'Killing all workers.'
self.scheduler.error = e self.scheduler.error = e
self.scheduler.stop() self.scheduler.stop()
if not self.scheduler.stopping.isSet(): if not self.scheduler.stopping.isSet():
self.scheduler.put(test_runner) self.scheduler.put(test_runner)
dbg('%s exiting.' % self) debug('%s exiting.' % self)
class Scheduler(Queue): class Scheduler(Queue):
def __init__(self, nworkers): def __init__(self, nworkers):
Queue.__init__(self) Queue.__init__(self)
if verbose: info('Initializing scheduler with %d jobs.', nworkers)
print >>sys.stderr, 'Initializing scheduler with %d jobs.' % nworkers
self.nworkers = nworkers self.nworkers = nworkers
self.passed = 0 self.passed = 0
self.failed = 0 self.failed = 0
...@@ -291,8 +280,7 @@ class Scheduler(Queue): ...@@ -291,8 +280,7 @@ class Scheduler(Queue):
self.error = None self.error = None
def run(self, timeout): def run(self, timeout):
if verbose: info('Starting workers.')
print >>sys.stderr, 'Starting workers.'
self.stopping.clear() self.stopping.clear()
for i in range(self.nworkers): for i in range(self.nworkers):
w = Worker(self) w = Worker(self)
...@@ -308,10 +296,12 @@ class Scheduler(Queue): ...@@ -308,10 +296,12 @@ class Scheduler(Queue):
break break
w.join(timeout=1.0) w.join(timeout=1.0)
except (KeyboardInterrupt, SystemExit): except (KeyboardInterrupt, SystemExit):
debug('Scheduler interrupted. Stopping and joining threads.')
self.stop() self.stop()
self.join() self.join()
sys.exit(0) sys.exit(0)
else: else:
debug('Scheduler stopped by someone else. Joining threads.')
self.join() self.join()
def join(self): def join(self):
...@@ -321,33 +311,45 @@ class Scheduler(Queue): ...@@ -321,33 +311,45 @@ class Scheduler(Queue):
self.workers.pop().join() self.workers.pop().join()
def stop(self): def stop(self):
if verbose: info('Stopping workers.')
print >>sys.stderr, 'Stopping workers.'
self.stopping.set() self.stopping.set()
def compiler_works(cc):
try:
devnull = open(os.devnull, 'w')
r = call([cc, '-v'], stdout=devnull, stderr=STDOUT)
devnull.close()
return r == 0
except OSError:
exception('Error running %s.', cc)
return False
def rebuild(tokudb, cc): def rebuild(tokudb, cc):
env = os.environ env = os.environ
env['CC'] = cc env['CC'] = cc
env['DEBUG'] = '0' env['DEBUG'] = '0'
if verbose: info('Updating from svn.')
print >>sys.stderr, 'Updating from svn.' devnull = open(os.devnull, 'w')
call(['svn', 'up'], cwd=tokudb) call(['svn', 'up'], stdout=devnull, stderr=STDOUT, cwd=tokudb)
if verbose: devnull.close()
print >>sys.stderr, 'Building tokudb.' if not compiler_works(cc):
error('Cannot find working compiler named "%s". Try sourcing the icc env script or providing another compiler with --cc.', cc)
sys.exit(r)
info('Building tokudb.')
r = call(['make', '-s', 'clean'], r = call(['make', '-s', 'clean'],
cwd=tokudb, env=env) cwd=tokudb, env=env)
if r != 0: if r != 0:
print >>sys.stderr, 'Build failed. Do you need to source the icc env script?' error('Cleaning the source tree failed.')
sys.exit(r) sys.exit(r)
r = call(['make', 'fastbuild'], r = call(['make', '-s', 'fastbuild'],
cwd=tokudb, env=env) cwd=tokudb, env=env)
if r != 0: if r != 0:
print >>sys.stderr, 'Build failed. Do you need to source the icc env script?' error('Building the fractal tree failed.')
sys.exit(r) sys.exit(r)
r = call(['make'] + testnames + recover_testnames, r = call(['make', '-s'] + testnames + recover_testnames,
cwd=os.path.join(tokudb, 'src', 'tests'), env=env) cwd=os.path.join(tokudb, 'src', 'tests'), env=env)
if r != 0: if r != 0:
print >>sys.stderr, 'Build failed. Do you need to source the icc env script?' error('Building the tests failed.')
sys.exit(r) sys.exit(r)
def revfor(tokudb): def revfor(tokudb):
...@@ -355,8 +357,7 @@ def revfor(tokudb): ...@@ -355,8 +357,7 @@ def revfor(tokudb):
shell=True, cwd=tokudb, stdout=PIPE) shell=True, cwd=tokudb, stdout=PIPE)
(out, err) = proc.communicate() (out, err) = proc.communicate()
rev = out.strip() rev = out.strip()
if verbose: info('Using tokudb at r%s.', rev)
print >>sys.stderr, 'Using tokudb at r%s.' % rev
return rev return rev
def main(opts): def main(opts):
...@@ -367,9 +368,8 @@ def main(opts): ...@@ -367,9 +368,8 @@ def main(opts):
os.mkdir(opts.log) os.mkdir(opts.log)
if not os.path.exists(opts.savedir): if not os.path.exists(opts.savedir):
os.mkdir(opts.savedir) os.mkdir(opts.savedir)
if verbose: info('Saving pass/fail logs to %s.', opts.log)
print >>sys.stderr, 'Saving pass/fail logs to %s.' % opts.log info('Saving failure environments to %s.', opts.savedir)
print >>sys.stderr, 'Saving failure environments to %s.' % opts.savedir
scheduler = Scheduler(opts.jobs) scheduler = Scheduler(opts.jobs)
...@@ -392,6 +392,7 @@ def main(opts): ...@@ -392,6 +392,7 @@ def main(opts):
while scheduler.error is None: while scheduler.error is None:
scheduler.run(opts.rebuild_period) scheduler.run(opts.rebuild_period)
if scheduler.error is not None: if scheduler.error is not None:
error('Scheduler reported an error.')
raise scheduler.error raise scheduler.error
rebuild(opts.tokudb, opts.cc) rebuild(opts.tokudb, opts.cc)
rev = revfor(opts.tokudb) rev = revfor(opts.tokudb)
...@@ -399,6 +400,9 @@ def main(opts): ...@@ -399,6 +400,9 @@ def main(opts):
runner.rev = rev runner.rev = rev
except (KeyboardInterrupt, SystemExit): except (KeyboardInterrupt, SystemExit):
sys.exit(0) sys.exit(0)
except Exception, e:
exception('Unhandled exception caught in main.')
raise e
# relpath implementation for python <2.6 # relpath implementation for python <2.6
# from http://unittest-ext.googlecode.com/hg-history/1df911640f7be239e58fb185b06ac2a8489dcdc4/unittest2/unittest2/compatibility.py # from http://unittest-ext.googlecode.com/hg-history/1df911640f7be239e58fb185b06ac2a8489dcdc4/unittest2/unittest2/compatibility.py
...@@ -486,6 +490,12 @@ if __name__ == '__main__': ...@@ -486,6 +490,12 @@ if __name__ == '__main__':
if len(args) > 0: if len(args) > 0:
parser.print_usage() parser.print_usage()
sys.exit(1) sys.exit(1)
debug = opts.debug
verbose = opts.verbose or opts.debug if opts.debug:
logging.basicConfig(level=logging.DEBUG)
elif opts.verbose:
logging.basicConfig(level=logging.INFO)
else:
logging.basicConfig(level=logging.WARNING)
main(opts) main(opts)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment