From 79f6fdeb3f17285f6611f21525fef2e6e6b81ace Mon Sep 17 00:00:00 2001 From: David Lamparter Date: Tue, 14 Jul 2020 23:30:28 +0200 Subject: [PATCH 1/3] topotests: unshittify backtraces add thread info, use "bt full" to get variables and add a bit of disassembly for good measure. Signed-off-by: David Lamparter --- tests/topotests/lib/topotest.py | 59 +++++++++++++++++---------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/tests/topotests/lib/topotest.py b/tests/topotests/lib/topotest.py index bffb8208e7..cb40feeeb8 100644 --- a/tests/topotests/lib/topotest.py +++ b/tests/topotests/lib/topotest.py @@ -51,6 +51,35 @@ from mininet.log import setLogLevel, info from mininet.cli import CLI from mininet.link import Intf +def gdb_core(obj, daemon, corefiles): + gdbcmds = ''' + info threads + bt full + disassemble + up + disassemble + up + disassemble + up + disassemble + up + disassemble + up + disassemble + ''' + gdbcmds = [['-ex', i.strip()] for i in gdbcmds.strip().split('\n')] + gdbcmds = [item for sl in gdbcmds for item in sl] + + daemon_path = os.path.join(obj.daemondir, daemon) + backtrace = subprocess.check_output( + ['gdb', daemon_path, corefiles[0], '--batch'] + gdbcmds + ) + sys.stderr.write( + "\n%s: %s crashed. Core file found - Backtrace follows:\n" + % (obj.name, daemon) + ) + sys.stderr.write("%s" % backtrace) + return backtrace class json_cmp_result(object): "json_cmp result class for better assertion messages" @@ -1348,20 +1377,7 @@ class Router(Node): "{}/{}/{}_core*.dmp".format(self.logdir, self.name, daemon) ) if len(corefiles) > 0: - daemon_path = os.path.join(self.daemondir, daemon) - backtrace = subprocess.check_output( - [ - "gdb {} {} --batch -ex bt 2> /dev/null".format( - daemon_path, corefiles[0] - ) - ], - shell=True, - ) - sys.stderr.write( - "\n%s: %s crashed. Core file found - Backtrace follows:\n" - % (self.name, daemon) - ) - sys.stderr.write("%s" % backtrace) + backtrace = gdb_core(self, daemon, corefiles) traces = ( traces + "\n%s: %s crashed. Core file found - Backtrace follows:\n%s" @@ -1431,20 +1447,7 @@ class Router(Node): "{}/{}/{}_core*.dmp".format(self.logdir, self.name, daemon) ) if len(corefiles) > 0: - daemon_path = os.path.join(self.daemondir, daemon) - backtrace = subprocess.check_output( - [ - "gdb {} {} --batch -ex bt 2> /dev/null".format( - daemon_path, corefiles[0] - ) - ], - shell=True, - ) - sys.stderr.write( - "\n%s: %s crashed. Core file found - Backtrace follows:\n" - % (self.name, daemon) - ) - sys.stderr.write("%s\n" % backtrace) + gdb_core(self, daemon, corefiles) else: # No core found - If we find matching logfile in /tmp, then print last 20 lines from it. if os.path.isfile( From ba5410e32f689e26b14b5eb9b6e6e01bcb66a2d1 Mon Sep 17 00:00:00 2001 From: David Lamparter Date: Wed, 15 Jul 2020 18:41:07 +0200 Subject: [PATCH 2/3] topotests: update Dockerfile ... python3 is non-optional now. Signed-off-by: David Lamparter --- tests/topotests/Dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/topotests/Dockerfile b/tests/topotests/Dockerfile index cdd0ae2f6e..b7c6298228 100644 --- a/tests/topotests/Dockerfile +++ b/tests/topotests/Dockerfile @@ -19,6 +19,7 @@ RUN export DEBIAN_FRONTEND=noninteractive \ libjson-c-dev \ libpcre3-dev \ libpython-dev \ + libpython3-dev \ libreadline-dev \ libc-ares-dev \ libcap-dev \ @@ -26,7 +27,10 @@ RUN export DEBIAN_FRONTEND=noninteractive \ mininet \ pkg-config \ python-pip \ - python-sphinx \ + python3 \ + python3-dev \ + python3-sphinx \ + python3-pytest \ rsync \ strace \ tcpdump \ From f033a78a99482b18f1b4c80cfaafcc0aebef1990 Mon Sep 17 00:00:00 2001 From: David Lamparter Date: Wed, 15 Jul 2020 18:48:18 +0200 Subject: [PATCH 3/3] topotests: stop wasting time at exit ... and clean up zombie child processes Signed-off-by: David Lamparter --- tests/topotests/lib/topotest.py | 64 +++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/tests/topotests/lib/topotest.py b/tests/topotests/lib/topotest.py index cb40feeeb8..b5fa2ea59b 100644 --- a/tests/topotests/lib/topotest.py +++ b/tests/topotests/lib/topotest.py @@ -35,6 +35,7 @@ import tempfile import platform import difflib import time +import signal from lib.topolog import logger from copy import deepcopy @@ -450,6 +451,10 @@ def pid_exists(pid): if pid <= 0: return False + try: + os.waitpid(pid, os.WNOHANG) + except: + pass try: os.kill(pid, 0) except OSError as err: @@ -1021,8 +1026,8 @@ class Router(Node): os.system("chmod -R go+rw /tmp/topotests") # Return count of running daemons - def countDaemons(self): - numRunning = 0 + def listDaemons(self): + ret = [] rundaemons = self.cmd("ls -1 /var/run/%s/*.pid" % self.routertype) errors = "" if re.search(r"No such file or directory", rundaemons): @@ -1031,12 +1036,11 @@ class Router(Node): for d in StringIO.StringIO(rundaemons): daemonpid = self.cmd("cat %s" % d.rstrip()).rstrip() if daemonpid.isdigit() and pid_exists(int(daemonpid)): - numRunning += 1 - return numRunning + ret.append(os.path.basename(d.rstrip().rsplit(".", 1)[0])) + return ret def stopRouter(self, wait=True, assertOnError=True, minErrorVersion="5.1"): # Stop Running FRR Daemons - numRunning = 0 rundaemons = self.cmd("ls -1 /var/run/%s/*.pid" % self.routertype) errors = "" if re.search(r"No such file or directory", rundaemons): @@ -1045,24 +1049,36 @@ class Router(Node): for d in StringIO.StringIO(rundaemons): daemonpid = self.cmd("cat %s" % d.rstrip()).rstrip() if daemonpid.isdigit() and pid_exists(int(daemonpid)): + daemonname = os.path.basename(d.rstrip().rsplit(".", 1)[0]) logger.info( "{}: stopping {}".format( - self.name, os.path.basename(d.rstrip().rsplit(".", 1)[0]) + self.name, daemonname ) ) - self.cmd("kill -TERM %s" % daemonpid) - self.waitOutput() - if pid_exists(int(daemonpid)): - numRunning += 1 + try: + os.kill(int(daemonpid), signal.SIGTERM) + except OSError as err: + if err.errno == errno.ESRCH: + logger.error("{}: {} left a dead pidfile (pid={})".format(self.name, daemonname, daemonpid)) + else: + logger.info("{}: {} could not kill pid {}: {}".format(self.name, daemonname, daemonpid, str(err))) - if wait and numRunning > 0: - counter = 5 - while counter > 0 and numRunning > 0: - sleep(2, "{}: waiting for daemons stopping".format(self.name)) - numRunning = self.countDaemons() + if not wait: + return errors + + running = self.listDaemons() + + if running: + sleep(0.1, "{}: waiting for daemons stopping: {}".format(self.name, ', '.join(running))) + running = self.listDaemons() + + counter = 20 + while counter > 0 and running: + sleep(0.5, "{}: waiting for daemons stopping: {}".format(self.name, ', '.join(running))) + running = self.listDaemons() counter -= 1 - if wait and numRunning > 0: + if running: # 2nd round of kill if daemons didn't exit for d in StringIO.StringIO(rundaemons): daemonpid = self.cmd("cat %s" % d.rstrip()).rstrip() @@ -1077,13 +1093,15 @@ class Router(Node): self.waitOutput() self.cmd("rm -- {}".format(d.rstrip())) - if wait: - errors = self.checkRouterCores(reportOnce=True) - if self.checkRouterVersion("<", minErrorVersion): - # ignore errors in old versions - errors = "" - if assertOnError and len(errors) > 0: - assert "Errors found - details follow:" == 0, errors + if not wait: + return errors + + errors = self.checkRouterCores(reportOnce=True) + if self.checkRouterVersion("<", minErrorVersion): + # ignore errors in old versions + errors = "" + if assertOnError and len(errors) > 0: + assert "Errors found - details follow:" == 0, errors return errors def removeIPs(self):