topotests: stop wasting time at exit

... and clean up zombie child processes

Signed-off-by: David Lamparter <equinox@opensourcerouting.org>
This commit is contained in:
David Lamparter 2020-07-15 18:48:18 +02:00
parent ba5410e32f
commit f033a78a99

View File

@ -35,6 +35,7 @@ import tempfile
import platform
import difflib
import time
import signal
from lib.topolog import logger
from copy import deepcopy
@ -450,6 +451,10 @@ def pid_exists(pid):
if pid <= 0:
return False
try:
os.waitpid(pid, os.WNOHANG)
except:
pass
try:
os.kill(pid, 0)
except OSError as err:
@ -1021,8 +1026,8 @@ class Router(Node):
os.system("chmod -R go+rw /tmp/topotests")
# Return count of running daemons
def countDaemons(self):
numRunning = 0
def listDaemons(self):
ret = []
rundaemons = self.cmd("ls -1 /var/run/%s/*.pid" % self.routertype)
errors = ""
if re.search(r"No such file or directory", rundaemons):
@ -1031,12 +1036,11 @@ class Router(Node):
for d in StringIO.StringIO(rundaemons):
daemonpid = self.cmd("cat %s" % d.rstrip()).rstrip()
if daemonpid.isdigit() and pid_exists(int(daemonpid)):
numRunning += 1
return numRunning
ret.append(os.path.basename(d.rstrip().rsplit(".", 1)[0]))
return ret
def stopRouter(self, wait=True, assertOnError=True, minErrorVersion="5.1"):
# Stop Running FRR Daemons
numRunning = 0
rundaemons = self.cmd("ls -1 /var/run/%s/*.pid" % self.routertype)
errors = ""
if re.search(r"No such file or directory", rundaemons):
@ -1045,24 +1049,36 @@ class Router(Node):
for d in StringIO.StringIO(rundaemons):
daemonpid = self.cmd("cat %s" % d.rstrip()).rstrip()
if daemonpid.isdigit() and pid_exists(int(daemonpid)):
daemonname = os.path.basename(d.rstrip().rsplit(".", 1)[0])
logger.info(
"{}: stopping {}".format(
self.name, os.path.basename(d.rstrip().rsplit(".", 1)[0])
self.name, daemonname
)
)
self.cmd("kill -TERM %s" % daemonpid)
self.waitOutput()
if pid_exists(int(daemonpid)):
numRunning += 1
try:
os.kill(int(daemonpid), signal.SIGTERM)
except OSError as err:
if err.errno == errno.ESRCH:
logger.error("{}: {} left a dead pidfile (pid={})".format(self.name, daemonname, daemonpid))
else:
logger.info("{}: {} could not kill pid {}: {}".format(self.name, daemonname, daemonpid, str(err)))
if wait and numRunning > 0:
counter = 5
while counter > 0 and numRunning > 0:
sleep(2, "{}: waiting for daemons stopping".format(self.name))
numRunning = self.countDaemons()
if not wait:
return errors
running = self.listDaemons()
if running:
sleep(0.1, "{}: waiting for daemons stopping: {}".format(self.name, ', '.join(running)))
running = self.listDaemons()
counter = 20
while counter > 0 and running:
sleep(0.5, "{}: waiting for daemons stopping: {}".format(self.name, ', '.join(running)))
running = self.listDaemons()
counter -= 1
if wait and numRunning > 0:
if running:
# 2nd round of kill if daemons didn't exit
for d in StringIO.StringIO(rundaemons):
daemonpid = self.cmd("cat %s" % d.rstrip()).rstrip()
@ -1077,13 +1093,15 @@ class Router(Node):
self.waitOutput()
self.cmd("rm -- {}".format(d.rstrip()))
if wait:
errors = self.checkRouterCores(reportOnce=True)
if self.checkRouterVersion("<", minErrorVersion):
# ignore errors in old versions
errors = ""
if assertOnError and len(errors) > 0:
assert "Errors found - details follow:" == 0, errors
if not wait:
return errors
errors = self.checkRouterCores(reportOnce=True)
if self.checkRouterVersion("<", minErrorVersion):
# ignore errors in old versions
errors = ""
if assertOnError and len(errors) > 0:
assert "Errors found - details follow:" == 0, errors
return errors
def removeIPs(self):