tests: add options for debugging topotest failures

Signed-off-by: Christian Hopps <chopps@labn.net>
This commit is contained in:
Christian Hopps 2021-03-03 21:56:46 -05:00
parent 97ec501ef0
commit 3f950192fe
3 changed files with 328 additions and 75 deletions

View File

@ -232,6 +232,85 @@ for ``master`` branch:
and create ``frr`` user and ``frrvty`` group as shown above. and create ``frr`` user and ``frrvty`` group as shown above.
Debugging Topotest Failures
^^^^^^^^^^^^^^^^^^^^^^^^^^^
For the debugging options below that launch programs: if the topotest is run
within screen_ or tmux_, then ``gdb``, the shell or ``vtysh`` will be launched
using that windowing program; otherwise mininet's ``xterm`` functionality will
be used to launch the given program.
If you wish to force the use of ``xterm`` rather than ``tmux`` or ``screen``, or
wish to use ``gnome-terminal`` instead of ``xterm``, set the environment
variable ``FRR_TOPO_TERMINAL`` to either ``xterm`` or ``gnome-terminal``.
.. _screen: https://www.gnu.org/software/screen/
.. _tmux: https://github.com/tmux/tmux/wiki
Spawning ``vtysh`` or Shells on Routers
"""""""""""""""""""""""""""""""""""""""
Topotest can automatically launch a shell or ``vtysh`` for any or all routers in
a test. This is enabled by specifying one of two CLI arguments, ``--shell`` or
``--vtysh``. Both of these options can be set to a single router value, multiple
comma-separated values, or ``all``.
When either of these options is specified, topotest will pause after each test
to allow for inspection of the router state.
Here's an example of launching ``vtysh`` on routers ``rt1`` and ``rt2``.
.. code:: shell
pytest --vtysh=rt1,rt2 all-protocol-startup
Spawning Mininet CLI, ``vtysh`` or Shells on Routers on Test Failure
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Similar to the previous section, one can have ``vtysh`` or a shell launched on
routers, but in this case only when a test fails. To launch the given process on
each router after a test failure, specify one of ``--shell-on-error`` or
``--vtysh-on-error``.
Here's an example of having ``vtysh`` launched on test failure.
.. code:: shell
pytest --vtysh-on-error all-protocol-startup
Additionally, one can have the mininet CLI invoked on test failures by
specifying the ``--mininet-on-error`` CLI option as shown in the example below.
.. code:: shell
pytest --mininet-on-error all-protocol-startup
Debugging with GDB
""""""""""""""""""
Topotest can automatically launch any daemon with ``gdb``, possibly setting
breakpoints for any test run. This is enabled by specifying one or both of the
CLI arguments ``--gdb-routers`` and ``--gdb-daemons``. Additionally,
``--gdb-breakpoints`` can be used to automatically set breakpoints in the
launched ``gdb`` processes.
Each of these options can be set to a single value, multiple comma-separated
values, or ``all``. If ``--gdb-routers`` is empty but ``--gdb-daemons`` is set
then the given daemons will be launched in ``gdb`` on all routers in the test.
Likewise if ``--gdb-routers`` is set, but ``--gdb-daemons`` is empty then all
daemons on the given routers will be launched in ``gdb``.
Here's an example of launching ``zebra`` and ``bgpd`` inside ``gdb`` on router
``r1`` with a breakpoint set on ``nb_config_diff``.
.. code:: shell
pytest --gdb-routers=r1 \
--gdb-daemons=bgpd,zebra \
--gdb-breakpoints=nb_config_diff \
all-protocol-startup
.. _topotests_docker: .. _topotests_docker:
Running Tests with Docker Running Tests with Docker

View File

@ -2,34 +2,87 @@
Topotest conftest.py file. Topotest conftest.py file.
""" """
from lib.topogen import get_topogen, diagnose_env import os
from lib.topotest import json_cmp_result import pdb
from lib.topolog import logger
import pytest import pytest
topology_only = False from lib.topogen import get_topogen, diagnose_env
from lib.topotest import json_cmp_result
from lib.topotest import g_extra_config as topotest_extra_config
from lib.topolog import logger
def pytest_addoption(parser): def pytest_addoption(parser):
""" """
Add topology-only option to the topology tester. This option makes pytest Add topology-only option to the topology tester. This option makes pytest
only run the setup_module() to setup the topology without running any tests. only run the setup_module() to setup the topology without running any tests.
""" """
parser.addoption(
"--gdb-breakpoints",
metavar="SYMBOL[,SYMBOL...]",
help="Comma-separated list of functions to set gdb breakpoints on",
)
parser.addoption(
"--gdb-daemons",
metavar="DAEMON[,DAEMON...]",
help="Comma-separated list of daemons to spawn gdb on, or 'all'",
)
parser.addoption(
"--gdb-routers",
metavar="ROUTER[,ROUTER...]",
help="Comma-separated list of routers to spawn gdb on, or 'all'",
)
parser.addoption(
"--mininet-on-error",
action="store_true",
help="Mininet cli on test failure",
)
parser.addoption(
"--pause-after",
action="store_true",
help="Pause after each test",
)
parser.addoption(
"--shell",
metavar="ROUTER[,ROUTER...]",
help="Comma-separated list of routers to spawn shell on, or 'all'",
)
parser.addoption(
"--shell-on-error",
action="store_true",
help="Spawn shell on all routers on test failure",
)
parser.addoption( parser.addoption(
"--topology-only", "--topology-only",
action="store_true", action="store_true",
help="Only set up this topology, don't run tests", help="Only set up this topology, don't run tests",
) )
parser.addoption(
"--vtysh",
metavar="ROUTER[,ROUTER...]",
help="Comma-separated list of routers to spawn vtysh on, or 'all'",
)
parser.addoption(
"--vtysh-on-error",
action="store_true",
help="Spawn vtysh on all routers on test failure",
)
def pytest_runtest_call(): def pytest_runtest_call():
""" """
This function must be run after setup_module(), it does standarized post This function must be run after setup_module(), it does standarized post
setup routines. It is only being used for the 'topology-only' option. setup routines. It is only being used for the 'topology-only' option.
""" """
global topology_only if topotest_extra_config["topology_only"]:
if topology_only:
tgen = get_topogen() tgen = get_topogen()
if tgen is not None: if tgen is not None:
# Allow user to play with the setup. # Allow user to play with the setup.
@ -42,6 +95,8 @@ def pytest_assertrepr_compare(op, left, right):
""" """
Show proper assertion error message for json_cmp results. Show proper assertion error message for json_cmp results.
""" """
del op
json_result = left json_result = left
if not isinstance(json_result, json_cmp_result): if not isinstance(json_result, json_cmp_result):
json_result = right json_result = right
@ -52,43 +107,105 @@ def pytest_assertrepr_compare(op, left, right):
def pytest_configure(config): def pytest_configure(config):
"Assert that the environment is correctly configured." """
Assert that the environment is correctly configured, and get extra config.
global topology_only """
if not diagnose_env(): if not diagnose_env():
pytest.exit("enviroment has errors, please read the logs") pytest.exit("environment has errors, please read the logs")
if config.getoption("--topology-only"): gdb_routers = config.getoption("--gdb-routers")
topology_only = True gdb_routers = gdb_routers.split(",") if gdb_routers else []
topotest_extra_config["gdb_routers"] = gdb_routers
gdb_daemons = config.getoption("--gdb-daemons")
gdb_daemons = gdb_daemons.split(",") if gdb_daemons else []
topotest_extra_config["gdb_daemons"] = gdb_daemons
gdb_breakpoints = config.getoption("--gdb-breakpoints")
gdb_breakpoints = gdb_breakpoints.split(",") if gdb_breakpoints else []
topotest_extra_config["gdb_breakpoints"] = gdb_breakpoints
mincli_on_error = config.getoption("--mininet-on-error")
topotest_extra_config["mininet_on_error"] = mincli_on_error
shell = config.getoption("--shell")
topotest_extra_config["shell"] = shell.split(",") if shell else []
pause_after = config.getoption("--pause-after")
shell_on_error = config.getoption("--shell-on-error")
topotest_extra_config["shell_on_error"] = shell_on_error
vtysh = config.getoption("--vtysh")
topotest_extra_config["vtysh"] = vtysh.split(",") if vtysh else []
vtysh_on_error = config.getoption("--vtysh-on-error")
topotest_extra_config["vtysh_on_error"] = vtysh_on_error
topotest_extra_config["pause_after"] = (
pause_after or shell or vtysh
)
topotest_extra_config["topology_only"] = config.getoption("--topology-only")
def pytest_runtest_makereport(item, call): def pytest_runtest_makereport(item, call):
"Log all assert messages to default logger with error level" "Log all assert messages to default logger with error level"
# Nothing happened # Nothing happened
if call.when == "call":
pause = topotest_extra_config["pause_after"]
else:
pause = False
if call.excinfo is None: if call.excinfo is None:
return error = False
else:
parent = item.parent
modname = parent.module.__name__
parent = item.parent # Treat skips as non errors, don't pause after
modname = parent.module.__name__ if call.excinfo.typename != "AssertionError":
pause = False
# Treat skips as non errors error = False
if call.excinfo.typename != "AssertionError": logger.info(
logger.info( 'assert skipped at "{}/{}": {}'.format(
'assert skipped at "{}/{}": {}'.format( modname, item.name, call.excinfo.value
modname, item.name, call.excinfo.value )
)
else:
error = True
# Handle assert failures
parent._previousfailed = item # pylint: disable=W0212
logger.error(
'assert failed at "{}/{}": {}'.format(modname, item.name, call.excinfo.value)
) )
)
return
# Handle assert failures # (topogen) Set topology error to avoid advancing in the test.
parent._previousfailed = item tgen = get_topogen()
logger.error( if tgen is not None:
'assert failed at "{}/{}": {}'.format(modname, item.name, call.excinfo.value) # This will cause topogen to report error on `routers_have_failure`.
) tgen.set_error("{}/{}".format(modname, item.name))
# (topogen) Set topology error to avoid advancing in the test.
tgen = get_topogen() if error and topotest_extra_config["shell_on_error"]:
if tgen is not None: for router in tgen.routers():
# This will cause topogen to report error on `routers_have_failure`. pause = True
tgen.set_error("{}/{}".format(modname, item.name)) tgen.net[router].runInWindow(os.getenv("SHELL", "bash"))
if error and topotest_extra_config["vtysh_on_error"]:
for router in tgen.routers():
pause = True
tgen.net[router].runInWindow("vtysh")
if error and topotest_extra_config["mininet_on_error"]:
tgen.mininet_cli()
if pause:
try:
user = raw_input('Testing paused, "pdb" to debug, "Enter" to continue: ')
except NameError:
user = input('Testing paused, "pdb" to debug, "Enter" to continue: ')
if user.strip() == "pdb":
pdb.set_trace()

View File

@ -50,7 +50,9 @@ from mininet.node import Node, OVSSwitch, Host
from mininet.log import setLogLevel, info from mininet.log import setLogLevel, info
from mininet.cli import CLI from mininet.cli import CLI
from mininet.link import Intf from mininet.link import Intf
from mininet.term import makeTerm
g_extra_config = {}
def gdb_core(obj, daemon, corefiles): def gdb_core(obj, daemon, corefiles):
gdbcmds = """ gdbcmds = """
@ -1303,6 +1305,37 @@ class Router(Node):
logger.info("No daemon {} known".format(daemon)) logger.info("No daemon {} known".format(daemon))
# print "Daemons after:", self.daemons # print "Daemons after:", self.daemons
# Run a command in a new window (gnome-terminal, screen, tmux, xterm)
def runInWindow(self, cmd, title=None):
    """
    Run `cmd` for this router in a new terminal window or pane.

    The window is opened with, in order of preference:

    * the terminal named by the ``FRR_TOPO_TERMINAL`` environment variable
      (passed to mininet's ``makeTerm``),
    * a new tmux pane when ``TMUX`` is set in the environment,
    * a new screen window when ``STY`` is set in the environment,
    * an ``xterm`` (via ``makeTerm``) otherwise.

    :param cmd: command line to run.
    :param title: window title for the ``makeTerm`` case; defaults to `cmd`.
    """
    topo_terminal = os.getenv("FRR_TOPO_TERMINAL")
    in_tmux = "TMUX" in os.environ
    in_screen = "STY" in os.environ

    # An explicit terminal choice always wins; with no multiplexer around,
    # xterm is the only option left.
    if topo_terminal or not (in_tmux or in_screen):
        makeTerm(
            self,
            title=title if title else cmd,
            term=topo_terminal if topo_terminal else "xterm",
            cmd=cmd,
        )
        return

    # The multiplexer starts the program itself, so the command has to
    # enter this router's mount and network namespaces explicitly.
    nscmd = "sudo nsenter -m -n -t {} {}".format(self.pid, cmd)

    if in_tmux:
        self.cmd("tmux select-layout main-horizontal")
        wcmd = "tmux split-window -h"
    else:
        session_socket = "/run/screen/S-{}/{}".format(
            os.environ.get("USER", ""), os.environ["STY"]
        )
        sudo_user = os.environ.get("SUDO_USER")
        # If the session socket is not found under the current user, talk to
        # screen as the user that invoked sudo. When SUDO_USER is not set
        # there is no such user to switch to, so plain `screen` is used
        # instead of failing with a KeyError.
        if os.path.exists(session_socket) or not sudo_user:
            wcmd = "screen"
        else:
            wcmd = "sudo -u {} screen".format(sudo_user)

    self.cmd("{} {}".format(wcmd, nscmd))
def startRouter(self, tgen=None): def startRouter(self, tgen=None):
# Disable integrated-vtysh-config # Disable integrated-vtysh-config
self.cmd( self.cmd(
@ -1355,6 +1388,14 @@ class Router(Node):
return "LDP/MPLS Tests need mpls kernel modules" return "LDP/MPLS Tests need mpls kernel modules"
self.cmd("echo 100000 > /proc/sys/net/mpls/platform_labels") self.cmd("echo 100000 > /proc/sys/net/mpls/platform_labels")
shell_routers = g_extra_config["shell"]
if "all" in shell_routers or self.name in shell_routers:
self.runInWindow(os.getenv("SHELL", "bash"))
vtysh_routers = g_extra_config["vtysh"]
if "all" in vtysh_routers or self.name in vtysh_routers:
self.runInWindow("vtysh")
if self.daemons["eigrpd"] == 1: if self.daemons["eigrpd"] == 1:
eigrpd_path = os.path.join(self.daemondir, "eigrpd") eigrpd_path = os.path.join(self.daemondir, "eigrpd")
if not os.path.isfile(eigrpd_path): if not os.path.isfile(eigrpd_path):
@ -1381,6 +1422,10 @@ class Router(Node):
def startRouterDaemons(self, daemons=None): def startRouterDaemons(self, daemons=None):
"Starts all FRR daemons for this router." "Starts all FRR daemons for this router."
gdb_breakpoints = g_extra_config["gdb_breakpoints"]
gdb_daemons = g_extra_config["gdb_daemons"]
gdb_routers = g_extra_config["gdb_routers"]
bundle_data = "" bundle_data = ""
if os.path.exists("/etc/frr/support_bundle_commands.conf"): if os.path.exists("/etc/frr/support_bundle_commands.conf"):
@ -1410,7 +1455,7 @@ class Router(Node):
# If `daemons` was specified then some upper API called us with # If `daemons` was specified then some upper API called us with
# specific daemons, otherwise just use our own configuration. # specific daemons, otherwise just use our own configuration.
daemons_list = [] daemons_list = []
if daemons != None: if daemons is not None:
daemons_list = daemons daemons_list = daemons
else: else:
# Append all daemons configured. # Append all daemons configured.
@ -1418,47 +1463,67 @@ class Router(Node):
if self.daemons[daemon] == 1: if self.daemons[daemon] == 1:
daemons_list.append(daemon) daemons_list.append(daemon)
# Start a single daemon on this router: either directly via self.cmd(), or
# under gdb in a new window when the gdb_routers/gdb_daemons lists select it.
# `extra_opts`, when given, is appended to the daemon's command-line options.
def start_daemon(daemon, extra_opts=None):
daemon_opts = self.daemons_options.get(daemon, "")
# stdout and stderr are captured in <daemon>.out and <daemon>.err.
rediropt = " > {0}.out 2> {0}.err".format(daemon)
if daemon == "snmpd":
# snmpd is run from a fixed system path rather than self.daemondir, with
# its own config/pid/agentx options and no ASAN environment setting.
binary = "/usr/sbin/snmpd"
cmdenv = ""
cmdopt = "{} -C -c /etc/frr/snmpd.conf -p ".format(
daemon_opts
) + "/var/run/{}/snmpd.pid -x /etc/frr/agentx".format(self.routertype)
else:
binary = os.path.join(self.daemondir, daemon)
cmdenv = "ASAN_OPTIONS=log_path={0}.asan".format(daemon)
cmdopt = "{} --log file:{}.log --log-level debug".format(
daemon_opts, daemon
)
if extra_opts:
cmdopt += " " + extra_opts
# Use gdb only when at least one of the two lists is non-empty and both
# this router and this daemon match. An empty list matches everything for
# its dimension, as does the value "all".
if (
(gdb_routers or gdb_daemons)
and (not gdb_routers
or self.name in gdb_routers
or "all" in gdb_routers)
and (not gdb_daemons
or daemon in gdb_daemons
or "all" in gdb_daemons)
):
if daemon == "snmpd":
# presumably keeps snmpd in the foreground under gdb -- confirm against snmpd(8)
cmdopt += " -f "
cmdopt += rediropt
# NOTE(review): cmdenv is not part of the gdb command line built below.
gdbcmd = "sudo -E gdb " + binary
if gdb_breakpoints:
# Lets gdb accept breakpoints on locations it cannot resolve yet.
gdbcmd += " -ex 'set breakpoint pending on'"
for bp in gdb_breakpoints:
gdbcmd += " -ex 'b {}'".format(bp)
# The daemon's options (including the redirections) become gdb's `run` arguments.
gdbcmd += " -ex 'run {}'".format(cmdopt)
self.runInWindow(gdbcmd, daemon)
else:
if daemon != "snmpd":
# presumably -d daemonizes the FRR daemon -- TODO confirm; snmpd is not given it
cmdopt += " -d "
cmdopt += rediropt
self.cmd(" ".join([cmdenv, binary, cmdopt]))
logger.info("{}: {} {} started".format(self, self.routertype, daemon))
# Start Zebra first # Start Zebra first
if "zebra" in daemons_list: if "zebra" in daemons_list:
zebra_path = os.path.join(self.daemondir, "zebra") start_daemon("zebra", "-s 90000000")
zebra_option = self.daemons_options["zebra"]
self.cmd(
"ASAN_OPTIONS=log_path=zebra.asan {0} {1} --log file:zebra.log --log-level debug -s 90000000 -d > zebra.out 2> zebra.err".format(
zebra_path, zebra_option
)
)
logger.debug("{}: {} zebra started".format(self, self.routertype))
# Remove `zebra` so we don't attempt to start it again.
while "zebra" in daemons_list: while "zebra" in daemons_list:
daemons_list.remove("zebra") daemons_list.remove("zebra")
# Start staticd next if required # Start staticd next if required
if "staticd" in daemons_list: if "staticd" in daemons_list:
staticd_path = os.path.join(self.daemondir, "staticd") start_daemon("staticd")
staticd_option = self.daemons_options["staticd"]
self.cmd(
"ASAN_OPTIONS=log_path=staticd.asan {0} {1} --log file:staticd.log --log-level debug -d > staticd.out 2> staticd.err".format(
staticd_path, staticd_option
)
)
logger.debug("{}: {} staticd started".format(self, self.routertype))
# Remove `staticd` so we don't attempt to start it again.
while "staticd" in daemons_list: while "staticd" in daemons_list:
daemons_list.remove("staticd") daemons_list.remove("staticd")
if "snmpd" in daemons_list: if "snmpd" in daemons_list:
snmpd_path = "/usr/sbin/snmpd" start_daemon("snmpd")
snmpd_option = self.daemons_options["snmpd"]
self.cmd(
"{0} {1} -C -c /etc/frr/snmpd.conf -p /var/run/{2}/snmpd.pid -x /etc/frr/agentx > snmpd.out 2> snmpd.err".format(
snmpd_path, snmpd_option, self.routertype
)
)
logger.info("{}: {} snmpd started".format(self, self.routertype))
# Remove `snmpd` so we don't attempt to start it again.
while "snmpd" in daemons_list: while "snmpd" in daemons_list:
daemons_list.remove("snmpd") daemons_list.remove("snmpd")
@ -1470,17 +1535,9 @@ class Router(Node):
# Now start all the other daemons # Now start all the other daemons
for daemon in daemons_list: for daemon in daemons_list:
# Skip disabled daemons and zebra
if self.daemons[daemon] == 0: if self.daemons[daemon] == 0:
continue continue
start_daemon(daemon)
daemon_path = os.path.join(self.daemondir, daemon)
self.cmd(
"ASAN_OPTIONS=log_path={2}.asan {0} {1} --log file:{2}.log --log-level debug -d > {2}.out 2> {2}.err".format(
daemon_path, self.daemons_options.get(daemon, ""), daemon
)
)
logger.debug("{}: {} {} started".format(self, self.routertype, daemon))
# Check if daemons are running. # Check if daemons are running.
rundaemons = self.cmd("ls -1 /var/run/%s/*.pid" % self.routertype) rundaemons = self.cmd("ls -1 /var/run/%s/*.pid" % self.routertype)