Merge pull request #6746 from opensourcerouting/topotests-wtf

topotests: stop the zombie apocalypse
This commit is contained in:
Donald Sharp 2020-07-15 20:04:07 -04:00 committed by GitHub
commit c71ff7a60e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 77 additions and 52 deletions

View File

@ -19,6 +19,7 @@ RUN export DEBIAN_FRONTEND=noninteractive \
libjson-c-dev \ libjson-c-dev \
libpcre3-dev \ libpcre3-dev \
libpython-dev \ libpython-dev \
libpython3-dev \
libreadline-dev \ libreadline-dev \
libc-ares-dev \ libc-ares-dev \
libcap-dev \ libcap-dev \
@ -26,7 +27,10 @@ RUN export DEBIAN_FRONTEND=noninteractive \
mininet \ mininet \
pkg-config \ pkg-config \
python-pip \ python-pip \
python-sphinx \ python3 \
python3-dev \
python3-sphinx \
python3-pytest \
rsync \ rsync \
strace \ strace \
tcpdump \ tcpdump \

View File

@ -35,6 +35,7 @@ import tempfile
import platform import platform
import difflib import difflib
import time import time
import signal
from lib.topolog import logger from lib.topolog import logger
from copy import deepcopy from copy import deepcopy
@ -51,6 +52,35 @@ from mininet.log import setLogLevel, info
from mininet.cli import CLI from mininet.cli import CLI
from mininet.link import Intf from mininet.link import Intf
def gdb_core(obj, daemon, corefiles, gdb_path="gdb"):
    """Run gdb in batch mode on a daemon core file and report the backtrace.

    The gdb output is written to stderr, preceded by a header naming the
    router and the crashed daemon, and is also returned to the caller.

    Parameters:
    * `obj`: object providing `daemondir` (directory holding the daemon
      binaries) and `name` (used in the stderr header)
    * `daemon`: file name of the daemon binary inside `obj.daemondir`
    * `corefiles`: sequence of core file paths; only the first entry is used
    * `gdb_path`: debugger executable to invoke (defaults to "gdb")

    Returns the debugger output as text.

    Raises IndexError when `corefiles` is empty, and whatever
    subprocess.check_output raises when the debugger cannot be started or
    exits with a non-zero status.
    """
    # Thread list and full backtrace first, then the disassembly of the
    # innermost frame followed by each of the five frames above it.
    gdbcmds = ["info threads", "bt full", "disassemble"] + ["up", "disassemble"] * 5

    # Every command is handed to gdb as its own "-ex <command>" pair.
    gdbargs = []
    for cmd in gdbcmds:
        gdbargs.extend(["-ex", cmd])

    daemon_path = os.path.join(obj.daemondir, daemon)
    backtrace = subprocess.check_output(
        [gdb_path, daemon_path, corefiles[0], "--batch"] + gdbargs
    )
    # check_output returns bytes on Python 3; without decoding, the "%s"
    # formatting here and in the callers would emit a b'...' repr instead of
    # the backtrace text. On Python 2 the result is already str.
    if not isinstance(backtrace, str):
        backtrace = backtrace.decode("utf-8", "replace")

    sys.stderr.write(
        "\n%s: %s crashed. Core file found - Backtrace follows:\n"
        % (obj.name, daemon)
    )
    sys.stderr.write("%s" % backtrace)
    return backtrace
class json_cmp_result(object): class json_cmp_result(object):
"json_cmp result class for better assertion messages" "json_cmp result class for better assertion messages"
@ -421,6 +451,10 @@ def pid_exists(pid):
if pid <= 0: if pid <= 0:
return False return False
try:
os.waitpid(pid, os.WNOHANG)
except:
pass
try: try:
os.kill(pid, 0) os.kill(pid, 0)
except OSError as err: except OSError as err:
@ -992,8 +1026,8 @@ class Router(Node):
os.system("chmod -R go+rw /tmp/topotests") os.system("chmod -R go+rw /tmp/topotests")
# Return count of running daemons # Return count of running daemons
def countDaemons(self): def listDaemons(self):
numRunning = 0 ret = []
rundaemons = self.cmd("ls -1 /var/run/%s/*.pid" % self.routertype) rundaemons = self.cmd("ls -1 /var/run/%s/*.pid" % self.routertype)
errors = "" errors = ""
if re.search(r"No such file or directory", rundaemons): if re.search(r"No such file or directory", rundaemons):
@ -1002,12 +1036,11 @@ class Router(Node):
for d in StringIO.StringIO(rundaemons): for d in StringIO.StringIO(rundaemons):
daemonpid = self.cmd("cat %s" % d.rstrip()).rstrip() daemonpid = self.cmd("cat %s" % d.rstrip()).rstrip()
if daemonpid.isdigit() and pid_exists(int(daemonpid)): if daemonpid.isdigit() and pid_exists(int(daemonpid)):
numRunning += 1 ret.append(os.path.basename(d.rstrip().rsplit(".", 1)[0]))
return numRunning return ret
def stopRouter(self, wait=True, assertOnError=True, minErrorVersion="5.1"): def stopRouter(self, wait=True, assertOnError=True, minErrorVersion="5.1"):
# Stop Running FRR Daemons # Stop Running FRR Daemons
numRunning = 0
rundaemons = self.cmd("ls -1 /var/run/%s/*.pid" % self.routertype) rundaemons = self.cmd("ls -1 /var/run/%s/*.pid" % self.routertype)
errors = "" errors = ""
if re.search(r"No such file or directory", rundaemons): if re.search(r"No such file or directory", rundaemons):
@ -1016,24 +1049,36 @@ class Router(Node):
for d in StringIO.StringIO(rundaemons): for d in StringIO.StringIO(rundaemons):
daemonpid = self.cmd("cat %s" % d.rstrip()).rstrip() daemonpid = self.cmd("cat %s" % d.rstrip()).rstrip()
if daemonpid.isdigit() and pid_exists(int(daemonpid)): if daemonpid.isdigit() and pid_exists(int(daemonpid)):
daemonname = os.path.basename(d.rstrip().rsplit(".", 1)[0])
logger.info( logger.info(
"{}: stopping {}".format( "{}: stopping {}".format(
self.name, os.path.basename(d.rstrip().rsplit(".", 1)[0]) self.name, daemonname
) )
) )
self.cmd("kill -TERM %s" % daemonpid) try:
self.waitOutput() os.kill(int(daemonpid), signal.SIGTERM)
if pid_exists(int(daemonpid)): except OSError as err:
numRunning += 1 if err.errno == errno.ESRCH:
logger.error("{}: {} left a dead pidfile (pid={})".format(self.name, daemonname, daemonpid))
else:
logger.info("{}: {} could not kill pid {}: {}".format(self.name, daemonname, daemonpid, str(err)))
if wait and numRunning > 0: if not wait:
counter = 5 return errors
while counter > 0 and numRunning > 0:
sleep(2, "{}: waiting for daemons stopping".format(self.name)) running = self.listDaemons()
numRunning = self.countDaemons()
if running:
sleep(0.1, "{}: waiting for daemons stopping: {}".format(self.name, ', '.join(running)))
running = self.listDaemons()
counter = 20
while counter > 0 and running:
sleep(0.5, "{}: waiting for daemons stopping: {}".format(self.name, ', '.join(running)))
running = self.listDaemons()
counter -= 1 counter -= 1
if wait and numRunning > 0: if running:
# 2nd round of kill if daemons didn't exit # 2nd round of kill if daemons didn't exit
for d in StringIO.StringIO(rundaemons): for d in StringIO.StringIO(rundaemons):
daemonpid = self.cmd("cat %s" % d.rstrip()).rstrip() daemonpid = self.cmd("cat %s" % d.rstrip()).rstrip()
@ -1048,7 +1093,9 @@ class Router(Node):
self.waitOutput() self.waitOutput()
self.cmd("rm -- {}".format(d.rstrip())) self.cmd("rm -- {}".format(d.rstrip()))
if wait: if not wait:
return errors
errors = self.checkRouterCores(reportOnce=True) errors = self.checkRouterCores(reportOnce=True)
if self.checkRouterVersion("<", minErrorVersion): if self.checkRouterVersion("<", minErrorVersion):
# ignore errors in old versions # ignore errors in old versions
@ -1348,20 +1395,7 @@ class Router(Node):
"{}/{}/{}_core*.dmp".format(self.logdir, self.name, daemon) "{}/{}/{}_core*.dmp".format(self.logdir, self.name, daemon)
) )
if len(corefiles) > 0: if len(corefiles) > 0:
daemon_path = os.path.join(self.daemondir, daemon) backtrace = gdb_core(self, daemon, corefiles)
backtrace = subprocess.check_output(
[
"gdb {} {} --batch -ex bt 2> /dev/null".format(
daemon_path, corefiles[0]
)
],
shell=True,
)
sys.stderr.write(
"\n%s: %s crashed. Core file found - Backtrace follows:\n"
% (self.name, daemon)
)
sys.stderr.write("%s" % backtrace)
traces = ( traces = (
traces traces
+ "\n%s: %s crashed. Core file found - Backtrace follows:\n%s" + "\n%s: %s crashed. Core file found - Backtrace follows:\n%s"
@ -1431,20 +1465,7 @@ class Router(Node):
"{}/{}/{}_core*.dmp".format(self.logdir, self.name, daemon) "{}/{}/{}_core*.dmp".format(self.logdir, self.name, daemon)
) )
if len(corefiles) > 0: if len(corefiles) > 0:
daemon_path = os.path.join(self.daemondir, daemon) gdb_core(self, daemon, corefiles)
backtrace = subprocess.check_output(
[
"gdb {} {} --batch -ex bt 2> /dev/null".format(
daemon_path, corefiles[0]
)
],
shell=True,
)
sys.stderr.write(
"\n%s: %s crashed. Core file found - Backtrace follows:\n"
% (self.name, daemon)
)
sys.stderr.write("%s\n" % backtrace)
else: else:
# No core found - If we find matching logfile in /tmp, then print last 20 lines from it. # No core found - If we find matching logfile in /tmp, then print last 20 lines from it.
if os.path.isfile( if os.path.isfile(