Merge pull request #13332 from LabNConsulting/chopps/new-opts

Donald Sharp 2023-04-22 19:51:05 -04:00 committed by GitHub
commit 2da3174115
8 changed files with 370 additions and 241 deletions


@ -402,6 +402,63 @@ environment.
.. _screen: https://www.gnu.org/software/screen/
.. _tmux: https://github.com/tmux/tmux/wiki
Capturing Packets
"""""""""""""""""
One can view and capture packets on any of the networks or interfaces defined by
the topotest by specifying the ``--pcap=NET|INTF|all[,NET|INTF,...]`` CLI option
as shown in the examples below.
.. code:: shell
# Capture on all networks in isis_topo1 test
sudo -E pytest isis_topo1 --pcap=all
# Capture on `sw1` network
sudo -E pytest isis_topo1 --pcap=sw1
# Capture on `sw1` network and on interface `eth0` on router `r2`
sudo -E pytest isis_topo1 --pcap=sw1,r2:r2-eth0
For each capture, a window is opened displaying a live summary of the captured
packets. Additionally, the entire packet stream is saved to a pcap file in the
test's log directory, e.g.:
.. code:: console
$ sudo -E pytest isis_topo1 --pcap=sw1,r2:r2-eth0
...
$ ls -l /tmp/topotests/isis_topo1.test_isis_topo1/
-rw------- 1 root root 45172 Apr 19 05:30 capture-r2-r2-eth0.pcap
-rw------- 1 root root 48412 Apr 19 05:30 capture-sw1.pcap
...
Viewing Live Daemon Logs
""""""""""""""""""""""""
One can view daemon logs (or the unified frr log) live in separate windows using
the ``--logd`` CLI option, as shown below.
.. code:: shell
# View `ripd` logs on all routers in test
sudo -E pytest rip_allow_ecmp --logd=ripd
# View `ripd` logs on all routers and `mgmtd` log on `r1`
sudo -E pytest rip_allow_ecmp --logd=ripd --logd=mgmtd,r1
For each daemon and router specified, a window is opened running a live
``tail -f`` of that daemon's log file in the test's log directory.
When using a unified log file `frr.log` one substitutes `frr` for the daemon
name in the ``--logd`` CLI option, e.g.,
.. code:: shell
# View `frr` log on all routers in test
sudo -E pytest some_test_suite --logd=frr
Spawning Debugging CLI, ``vtysh`` or Shells on Routers on Test Failure
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
@ -421,12 +478,30 @@ the help command from within a CLI launched on error:
test_bgp_multiview_topo1/test_bgp_routingTable> help
Commands:
help :: this help
sh [hosts] <shell-command> :: execute <shell-command> on <host>
term [hosts] :: open shell terminals for hosts
vtysh [hosts] :: open vtysh terminals for hosts
[hosts] <vtysh-command> :: execute vtysh-command on hosts
Basic Commands:
cli :: open a secondary CLI window
help :: this help
hosts :: list hosts
quit :: quit the cli
HOST can be a host or one of the following:
- '*' for all hosts
- '.' for the parent munet
- a regex specified between '/' (e.g., '/rtr.*/')
New Window Commands:
logd HOST [HOST ...] DAEMON :: tail -f on the logfile of the given DAEMON for the given HOST[S]
pcap NETWORK :: capture packets from NETWORK into file capture-NETWORK.pcap; the command is run within a new window which also shows packet summaries. NETWORK can also be an interface specified as HOST:INTF to capture inside the host namespace.
stderr HOST [HOST ...] DAEMON :: tail -f on the stderr of the given DAEMON for the given HOST[S]
stdlog HOST [HOST ...] :: tail -f on the `frr.log` for the given HOST[S]
stdout HOST [HOST ...] DAEMON :: tail -f on the stdout of the given DAEMON for the given HOST[S]
term HOST [HOST ...] :: open terminal[s] (TMUX or XTerm) on HOST[S], * for all
vtysh ROUTER [ROUTER ...] ::
xterm HOST [HOST ...] :: open XTerm[s] on HOST[S], * for all
Inline Commands:
[ROUTER ...] COMMAND :: execute vtysh COMMAND on the router[s]
[HOST ...] sh <SHELL-COMMAND> :: execute <SHELL-COMMAND> on hosts
[HOST ...] shi <INTERACTIVE-COMMAND> :: execute <INTERACTIVE-COMMAND> on HOST[s]
test_bgp_multiview_topo1/test_bgp_routingTable> r1 show int br
------ Host: r1 ------


@ -1159,7 +1159,7 @@ def test_BGP_GR_TC_31_2_p1(request):
reset_config_on_routers(tgen)
logger.info(
"[Phase 1] : Test Setup " "[Disable Mode]R1-----R2[Restart Mode] initialized "
"[Phase 1] : Test Setup " "[Disable Mode]R1-----R2[Helper Mode] initialized "
)
# Configure graceful-restart
@ -1251,7 +1251,7 @@ def test_BGP_GR_TC_31_2_p1(request):
tc_name, result
)
logger.info("[Phase 2] : R2 Goes from Disable to Restart Mode ")
logger.info("[Phase 2] : R1 Goes from Disable to Restart Mode ")
# Configure graceful-restart
input_dict = {
@ -1356,31 +1356,7 @@ def test_BGP_GR_TC_31_2_p1(request):
},
}
# here the verify_graceful_restart for the neighbor would be
# "NotReceived" as the latest GR config is not yet applied.
for addr_type in ADDR_TYPES:
result = verify_graceful_restart(
tgen, topo, addr_type, input_dict, dut="r1", peer="r2"
)
assert result is True, "Testcase {} : Failed \n Error {}".format(
tc_name, result
)
for addr_type in ADDR_TYPES:
# Verifying RIB routes
next_hop = next_hop_per_address_family(
tgen, dut, peer, addr_type, NEXT_HOP_IP_2
)
input_topo = {key: topo["routers"][key] for key in ["r2"]}
result = verify_rib(tgen, addr_type, dut, input_topo, next_hop, protocol)
assert result is True, "Testcase {} : Failed \n Error {}".format(
tc_name, result
)
logger.info("[Phase 6] : R1 is about to come up now ")
start_router_daemons(tgen, "r1", ["bgpd"])
logger.info("[Phase 4] : R1 is UP now, so time to collect GR stats ")
logger.info("[Phase 4] : R1 is UP and GR state is correct ")
for addr_type in ADDR_TYPES:
result = verify_graceful_restart(


@ -1,3 +1,4 @@
# -*- coding: utf-8 eval: (blacken-mode 1) -*-
"""
Topotest conftest.py file.
"""
@ -5,25 +6,24 @@ Topotest conftest.py file.
import glob
import os
import pdb
import re
import resource
import subprocess
import sys
import time
import pytest
import lib.fixtures
from lib import topolog
from lib.micronet_compat import Mininet
import pytest
from lib.micronet_compat import ConfigOptionsProxy, Mininet
from lib.topogen import diagnose_env, get_topogen
from lib.topolog import logger
from lib.topotest import g_extra_config as topotest_extra_config
from lib.topotest import json_cmp_result
from munet import cli
from munet.base import Commander, proc_error
from munet.cleanup import cleanup_current, cleanup_previous
from munet import cli
from munet.testing.util import pause_test
from lib import topolog, topotest
def pytest_addoption(parser):
@ -61,12 +61,28 @@ def pytest_addoption(parser):
help="Comma-separated list of routers to spawn gdb on, or 'all'",
)
parser.addoption(
"--logd",
action="append",
metavar="DAEMON[,ROUTER[,...]",
help=(
"Tail-F of DAEMON log file. Specify routers in comma-separated list after "
"daemon to limit to a subset of routers"
),
)
parser.addoption(
"--pause",
action="store_true",
help="Pause after each test",
)
parser.addoption(
"--pause-at-end",
action="store_true",
help="Pause before taking munet down",
)
parser.addoption(
"--pause-on-error",
action="store_true",
@ -80,6 +96,13 @@ def pytest_addoption(parser):
help="Do not pause after (disables default when --shell or -vtysh given)",
)
parser.addoption(
"--pcap",
default="",
metavar="NET[,NET...]",
help="Comma-separated list of networks to capture packets on, or 'all'",
)
rundir_help = "directory for running in and log files"
parser.addini("rundir", rundir_help, default="/tmp/topotests")
parser.addoption("--rundir", metavar="DIR", help=rundir_help)
@ -135,7 +158,7 @@ def pytest_addoption(parser):
def check_for_memleaks():
assert topotest_extra_config["valgrind_memleaks"]
assert topotest.g_pytest_config.option.valgrind_memleaks
leaks = []
tgen = get_topogen() # pylint: disable=redefined-outer-name
@ -179,16 +202,15 @@ def check_for_memleaks():
@pytest.fixture(autouse=True, scope="module")
def module_check_memtest(request):
del request # disable unused warning
yield
if topotest_extra_config["valgrind_memleaks"]:
if request.config.option.valgrind_memleaks:
if get_topogen() is not None:
check_for_memleaks()
def pytest_runtest_logstart(nodeid, location):
# location is (filename, lineno, testname)
topolog.logstart(nodeid, location, topotest_extra_config["rundir"])
topolog.logstart(nodeid, location, topotest.g_pytest_config.option.rundir)
def pytest_runtest_logfinish(nodeid, location):
@ -199,10 +221,9 @@ def pytest_runtest_logfinish(nodeid, location):
@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_call(item: pytest.Item) -> None:
"Hook the function that is called to execute the test."
del item # disable unused warning
# For topology only run the CLI then exit
if topotest_extra_config["topology_only"]:
if item.config.option.topology_only:
get_topogen().cli()
pytest.exit("exiting after --topology-only")
@ -210,7 +231,7 @@ def pytest_runtest_call(item: pytest.Item) -> None:
yield
# Check for leaks if requested
if topotest_extra_config["valgrind_memleaks"]:
if item.config.option.valgrind_memleaks:
check_for_memleaks()
@ -233,6 +254,7 @@ def pytest_configure(config):
"""
Assert that the environment is correctly configured, and get extra config.
"""
topotest.g_pytest_config = ConfigOptionsProxy(config)
if config.getoption("--collect-only"):
return
@ -254,11 +276,13 @@ def pytest_configure(config):
# Set some defaults for the pytest.ini [pytest] section
# ---------------------------------------------------
rundir = config.getoption("--rundir")
rundir = config.option.rundir
if not rundir:
rundir = config.getini("rundir")
if not rundir:
rundir = "/tmp/topotests"
config.option.rundir = rundir
if not config.getoption("--junitxml"):
config.option.xmlpath = os.path.join(rundir, "topotests.xml")
xmlpath = config.option.xmlpath
@ -271,8 +295,6 @@ def pytest_configure(config):
mv_path = commander.get_exec_path("mv")
commander.cmd_status([mv_path, xmlpath, xmlpath + suffix])
topotest_extra_config["rundir"] = rundir
# Set the log_file (exec) to inside the rundir if not specified
if not config.getoption("--log-file") and not config.getini("log_file"):
config.option.log_file = os.path.join(rundir, "exec.log")
@ -302,69 +324,19 @@ def pytest_configure(config):
elif b and not is_xdist and not have_windows:
pytest.exit("{} use requires byobu/TMUX/SCREEN/XTerm".format(feature))
# ---------------------------------------
# Record our options in global dictionary
# ---------------------------------------
#
# Check for window capability if given options that require window
#
assert_feature_windows(config.option.gdb_routers, "GDB")
assert_feature_windows(config.option.gdb_daemons, "GDB")
assert_feature_windows(config.option.cli_on_error, "--cli-on-error")
assert_feature_windows(config.option.shell, "--shell")
assert_feature_windows(config.option.shell_on_error, "--shell-on-error")
assert_feature_windows(config.option.vtysh, "--vtysh")
assert_feature_windows(config.option.vtysh_on_error, "--vtysh-on-error")
topotest_extra_config["rundir"] = rundir
asan_abort = config.getoption("--asan-abort")
topotest_extra_config["asan_abort"] = asan_abort
gdb_routers = config.getoption("--gdb-routers")
gdb_routers = gdb_routers.split(",") if gdb_routers else []
topotest_extra_config["gdb_routers"] = gdb_routers
gdb_daemons = config.getoption("--gdb-daemons")
gdb_daemons = gdb_daemons.split(",") if gdb_daemons else []
topotest_extra_config["gdb_daemons"] = gdb_daemons
assert_feature_windows(gdb_routers or gdb_daemons, "GDB")
gdb_breakpoints = config.getoption("--gdb-breakpoints")
gdb_breakpoints = gdb_breakpoints.split(",") if gdb_breakpoints else []
topotest_extra_config["gdb_breakpoints"] = gdb_breakpoints
cli_on_error = config.getoption("--cli-on-error")
topotest_extra_config["cli_on_error"] = cli_on_error
assert_feature_windows(cli_on_error, "--cli-on-error")
shell = config.getoption("--shell")
topotest_extra_config["shell"] = shell.split(",") if shell else []
assert_feature_windows(shell, "--shell")
strace = config.getoption("--strace-daemons")
topotest_extra_config["strace_daemons"] = strace.split(",") if strace else []
shell_on_error = config.getoption("--shell-on-error")
topotest_extra_config["shell_on_error"] = shell_on_error
assert_feature_windows(shell_on_error, "--shell-on-error")
topotest_extra_config["valgrind_extra"] = config.getoption("--valgrind-extra")
topotest_extra_config["valgrind_memleaks"] = config.getoption("--valgrind-memleaks")
vtysh = config.getoption("--vtysh")
topotest_extra_config["vtysh"] = vtysh.split(",") if vtysh else []
assert_feature_windows(vtysh, "--vtysh")
vtysh_on_error = config.getoption("--vtysh-on-error")
topotest_extra_config["vtysh_on_error"] = vtysh_on_error
assert_feature_windows(vtysh_on_error, "--vtysh-on-error")
pause_on_error = vtysh or shell or config.getoption("--pause-on-error")
if config.getoption("--no-pause-on-error"):
pause_on_error = False
topotest_extra_config["pause_on_error"] = pause_on_error
assert_feature_windows(pause_on_error, "--pause-on-error")
pause = config.getoption("--pause")
topotest_extra_config["pause"] = pause
assert_feature_windows(pause, "--pause")
topology_only = config.getoption("--topology-only")
if topology_only and is_xdist:
if config.option.topology_only and is_xdist:
pytest.exit("Cannot use --topology-only with distributed test mode")
topotest_extra_config["topology_only"] = topology_only
# Check environment now that we have config
if not diagnose_env(rundir):
@ -430,10 +402,7 @@ def pytest_runtest_makereport(item, call):
# We want to pause, if requested, on any error not just test cases
# (e.g., call.when == "setup")
if not pause:
pause = (
topotest_extra_config["pause_on_error"]
or topotest_extra_config["pause"]
)
pause = item.config.option.pause_on_error or item.config.option.pause
# (topogen) Set topology error to avoid advancing in the test.
tgen = get_topogen() # pylint: disable=redefined-outer-name
@ -445,9 +414,9 @@ def pytest_runtest_makereport(item, call):
isatty = sys.stdout.isatty()
error_cmd = None
if error and topotest_extra_config["vtysh_on_error"]:
if error and item.config.option.vtysh_on_error:
error_cmd = commander.get_exec_path(["vtysh"])
elif error and topotest_extra_config["shell_on_error"]:
elif error and item.config.option.shell_on_error:
error_cmd = os.getenv("SHELL", commander.get_exec_path(["bash"]))
if error_cmd:
@ -499,7 +468,7 @@ def pytest_runtest_makereport(item, call):
if p.wait():
logger.warning("xterm proc failed: %s:", proc_error(p, o, e))
if error and topotest_extra_config["cli_on_error"]:
if error and item.config.option.cli_on_error:
# Really would like something better than using this global here.
# Not all tests use topogen though so get_topogen() won't work.
if Mininet.g_mnet_inst:
@ -507,23 +476,8 @@ def pytest_runtest_makereport(item, call):
else:
logger.error("Could not launch CLI b/c no mininet exists yet")
while pause and isatty:
try:
user = raw_input(
'PAUSED, "cli" for CLI, "pdb" to debug, "Enter" to continue: '
)
except NameError:
user = input('PAUSED, "cli" for CLI, "pdb" to debug, "Enter" to continue: ')
user = user.strip()
if user == "cli":
cli.cli(Mininet.g_mnet_inst)
elif user == "pdb":
pdb.set_trace() # pylint: disable=forgotten-debug-statement
elif user:
print('Unrecognized input: "%s"' % user)
else:
break
if pause and isatty:
pause_test()
#


@ -12,6 +12,49 @@ from munet import cli
from munet.base import BaseMunet, LinuxNamespace
def cli_opt_list(option_list):
if not option_list:
return []
if isinstance(option_list, str):
return [x for x in option_list.split(",") if x]
return [x for x in option_list if x]
def name_in_cli_opt_str(name, option_list):
ol = cli_opt_list(option_list)
return name in ol or "all" in ol
class ConfigOptionsProxy:
def __init__(self, pytestconfig=None):
if isinstance(pytestconfig, ConfigOptionsProxy):
self.config = pytestconfig.config
else:
self.config = pytestconfig
self.option = self.config.option
def getoption(self, opt, defval=None):
if not self.config:
return defval
value = self.config.getoption(opt)
if value is None:
return defval
return value
def get_option(self, opt, defval=None):
return self.getoption(opt, defval)
def get_option_list(self, opt):
value = self.get_option(opt, "")
return cli_opt_list(value)
def name_in_option_list(self, name, opt):
optlist = self.get_option_list(opt)
return "all" in optlist or name in optlist
class Node(LinuxNamespace):
"""Node (mininet compat)."""
@ -23,6 +66,8 @@ class Node(LinuxNamespace):
nkwargs["unet"] = kwargs["unet"]
if "private_mounts" in kwargs:
nkwargs["private_mounts"] = kwargs["private_mounts"]
if "logger" in kwargs:
nkwargs["logger"] = kwargs["logger"]
# This is expected by newer munet CLI code
self.config_dirname = ""
@ -120,7 +165,7 @@ class Mininet(BaseMunet):
g_mnet_inst = None
def __init__(self, rundir=None):
def __init__(self, rundir=None, pytestconfig=None):
"""
Create a Micronet.
"""
@ -132,6 +177,8 @@ class Mininet(BaseMunet):
self.host_params = {}
self.prefix_len = 8
self.cfgopt = ConfigOptionsProxy(pytestconfig)
# SNMPd used to require this, which was set in the mininet shell
# that all commands executed from. This is a goofy default, so let's not
# do it if we don't have to. The snmpd.conf files have been updated
@ -268,12 +315,9 @@ ff02::2\tip6-allrouters
cli.add_cli_config(self, cdict)
# shellopt = (
# self.pytest_config.getoption("--shell") if self.pytest_config else None
# )
# shellopt = shellopt if shellopt is not None else ""
# if shellopt == "all" or "." in shellopt.split(","):
# self.run_in_window("bash")
shellopt = self.cfgopt.get_option_list("--shell")
if "all" in shellopt or "." in shellopt:
self.run_in_window("bash")
# This is expected by newer munet CLI code
self.config_dirname = ""
@ -335,6 +379,24 @@ ff02::2\tip6-allrouters
def start(self):
"""Start the micronet topology."""
pcapopt = self.cfgopt.get_option_list("--pcap")
if "all" in pcapopt:
pcapopt = self.switches.keys()
for pcap in pcapopt:
if ":" in pcap:
host, intf = pcap.split(":")
pcap = f"{host}-{intf}"
host = self.hosts[host]
else:
host = self
intf = pcap
pcapfile = f"{self.rundir}/capture-{pcap}.pcap"
host.run_in_window(
f"tshark -s 9200 -i {intf} -P -w {pcapfile}",
background=True,
title=f"cap:{pcap}",
)
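# Example: ``--pcap=sw1,r2:r2-eth0`` opens two capture windows, one running
# tshark on network ``sw1`` in the top-level namespace (writing
# capture-sw1.pcap under the rundir) and one on interface ``r2-eth0`` inside
# the ``r2`` host namespace (writing capture-r2-r2-eth0.pcap).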
self.logger.debug("%s: Starting (no-op).", self)
def stop(self):


@ -25,6 +25,7 @@ Basic usage instructions:
* After running stop Mininet with: tgen.stop_topology()
"""
import configparser
import grp
import inspect
import json
@ -38,16 +39,11 @@ import subprocess
import sys
from collections import OrderedDict
if sys.version_info[0] > 2:
import configparser
else:
import ConfigParser as configparser
import lib.topolog as topolog
from lib.micronet import Commander
from lib.micronet_compat import Mininet
from lib.topolog import logger
from lib.topotest import g_extra_config
from munet.testing.util import pause_test
from lib import topotest
@ -193,7 +189,7 @@ class Topogen(object):
self._load_config()
# Create new log directory
self.logdir = topotest.get_logs_path(g_extra_config["rundir"])
self.logdir = topotest.get_logs_path(topotest.g_pytest_config.option.rundir)
subprocess.check_call(
"mkdir -p {0} && chmod 1777 {0}".format(self.logdir), shell=True
)
@ -213,7 +209,7 @@ class Topogen(object):
# Mininet(Micronet) to build the actual topology.
assert not inspect.isclass(topodef)
self.net = Mininet(rundir=self.logdir)
self.net = Mininet(rundir=self.logdir, pytestconfig=topotest.g_pytest_config)
# Adjust the parent namespace
topotest.fix_netns_limits(self.net)
@ -455,7 +451,18 @@ class Topogen(object):
first is a simple kill with no sleep, the second will sleep if not
killed and try with a different signal.
"""
pause = bool(self.net.cfgopt.get_option("--pause-at-end"))
pause = pause or bool(self.net.cfgopt.get_option("--pause"))
if pause:
try:
pause_test("Before MUNET delete")
except KeyboardInterrupt:
print("^C...continuing")
except Exception as error:
self.logger.error("\n...continuing after error: %s", error)
logger.info("stopping topology: {}".format(self.modname))
errors = ""
for gear in self.gears.values():
errors += gear.stop()


@ -9,13 +9,13 @@
# Network Device Education Foundation, Inc. ("NetDEF")
#
import configparser
import difflib
import errno
import functools
import glob
import json
import os
import pdb
import platform
import re
import resource
@ -24,22 +24,17 @@ import subprocess
import sys
import tempfile
import time
from collections.abc import Mapping
from copy import deepcopy
import lib.topolog as topolog
from lib.micronet_compat import Node
from lib.topolog import logger
if sys.version_info[0] > 2:
import configparser
from collections.abc import Mapping
else:
import ConfigParser as configparser
from collections import Mapping
from munet.base import Timeout
from lib import micronet
from lib.micronet_compat import Node
g_extra_config = {}
g_pytest_config = None
def get_logs_path(rundir):
@ -474,32 +469,6 @@ def int2dpid(dpid):
)
def pid_exists(pid):
"Check whether pid exists in the current process table."
if pid <= 0:
return False
try:
os.waitpid(pid, os.WNOHANG)
except:
pass
try:
os.kill(pid, 0)
except OSError as err:
if err.errno == errno.ESRCH:
# ESRCH == No such process
return False
elif err.errno == errno.EPERM:
# EPERM clearly means there's a process to deny access to
return True
else:
# According to "man 2 kill" possible error values are
# (EINVAL, EPERM, ESRCH)
raise
else:
return True
def get_textdiff(text1, text2, title1="", title2="", **opts):
"Returns empty string if same or formatted diff"
@ -1086,7 +1055,7 @@ def checkAddressSanitizerError(output, router, component, logdir=""):
# No Address Sanitizer Error in Output. Now check for AddressSanitizer daemon file
if logdir:
filepattern = logdir + "/" + router + "/" + component + ".asan.*"
filepattern = logdir + "/" + router + ".asan." + component + ".*"
logger.debug(
"Log check for %s on %s, pattern %s\n" % (component, router, filepattern)
)
@ -1303,7 +1272,8 @@ def fix_host_limits():
def setup_node_tmpdir(logdir, name):
# Cleanup old log, valgrind, and core files.
subprocess.check_call(
"rm -rf {0}/{1}.valgrind.* {1}.*.asan {0}/{1}/".format(logdir, name), shell=True
"rm -rf {0}/{1}.valgrind.* {0}/{1}.asan.* {0}/{1}/".format(logdir, name),
shell=True,
)
# Setup the per node directory.
@ -1339,7 +1309,7 @@ class Router(Node):
# specified, then attempt to generate a unique logdir.
self.logdir = params.get("logdir")
if self.logdir is None:
self.logdir = get_logs_path(g_extra_config["rundir"])
self.logdir = get_logs_path(g_pytest_config.getoption("--rundir"))
if not params.get("logger"):
# If logger is present topogen has already set this up
@ -1532,7 +1502,7 @@ class Router(Node):
)
except Exception as ex:
logger.error("%s can't remove IPs %s", self, str(ex))
# pdb.set_trace()
# breakpoint()
# assert False, "can't remove IPs %s" % str(ex)
def checkCapability(self, daemon, param):
@ -1598,10 +1568,7 @@ class Router(Node):
if (daemon == "zebra") and (self.daemons["mgmtd"] == 0):
# Add mgmtd with zebra - if it exists
try:
mgmtd_path = os.path.join(self.daemondir, "mgmtd")
except:
pdb.set_trace()
mgmtd_path = os.path.join(self.daemondir, "mgmtd")
if os.path.isfile(mgmtd_path):
self.daemons["mgmtd"] = 1
self.daemons_options["mgmtd"] = ""
@ -1609,11 +1576,7 @@ class Router(Node):
if (daemon == "zebra") and (self.daemons["staticd"] == 0):
# Add staticd with zebra - if it exists
try:
staticd_path = os.path.join(self.daemondir, "staticd")
except:
pdb.set_trace()
staticd_path = os.path.join(self.daemondir, "staticd")
if os.path.isfile(staticd_path):
self.daemons["staticd"] = 1
self.daemons_options["staticd"] = ""
@ -1688,8 +1651,7 @@ class Router(Node):
# used
self.cmd("echo 100000 > /proc/sys/net/mpls/platform_labels")
shell_routers = g_extra_config["shell"]
if "all" in shell_routers or self.name in shell_routers:
if g_pytest_config.name_in_option_list(self.name, "--shell"):
self.run_in_window(os.getenv("SHELL", "bash"), title="sh-%s" % self.name)
if self.daemons["eigrpd"] == 1:
@ -1706,8 +1668,7 @@ class Router(Node):
status = self.startRouterDaemons(tgen=tgen)
vtysh_routers = g_extra_config["vtysh"]
if "all" in vtysh_routers or self.name in vtysh_routers:
if g_pytest_config.name_in_option_list(self.name, "--vtysh"):
self.run_in_window("vtysh", title="vt-%s" % self.name)
if self.unified_config:
@ -1727,13 +1688,13 @@ class Router(Node):
def startRouterDaemons(self, daemons=None, tgen=None):
"Starts FRR daemons for this router."
asan_abort = g_extra_config["asan_abort"]
gdb_breakpoints = g_extra_config["gdb_breakpoints"]
gdb_daemons = g_extra_config["gdb_daemons"]
gdb_routers = g_extra_config["gdb_routers"]
valgrind_extra = g_extra_config["valgrind_extra"]
valgrind_memleaks = g_extra_config["valgrind_memleaks"]
strace_daemons = g_extra_config["strace_daemons"]
asan_abort = bool(g_pytest_config.option.asan_abort)
gdb_breakpoints = g_pytest_config.get_option_list("--gdb-breakpoints")
gdb_daemons = g_pytest_config.get_option_list("--gdb-daemons")
gdb_routers = g_pytest_config.get_option_list("--gdb-routers")
valgrind_extra = bool(g_pytest_config.option.valgrind_extra)
valgrind_memleaks = bool(g_pytest_config.option.valgrind_memleaks)
strace_daemons = g_pytest_config.get_option_list("--strace-daemons")
# Get global bundle data
if not self.path_exists("/etc/frr/support_bundle_commands.conf"):
@ -1757,11 +1718,21 @@ class Router(Node):
self.reportCores = True
# XXX: glue code forward ported from removed function.
if self.version == None:
if self.version is None:
self.version = self.cmd(
os.path.join(self.daemondir, "bgpd") + " -v"
).split()[2]
logger.info("{}: running version: {}".format(self.name, self.version))
logd_options = {}
for logd in g_pytest_config.get_option("--logd", []):
if "," in logd:
daemon, routers = logd.split(",", 1)
logd_options[daemon] = routers.split(",")
else:
daemon = logd
logd_options[daemon] = ["all"]
# If `daemons` was specified then some upper API called us with
# specific daemons, otherwise just use our own configuration.
daemons_list = []
@ -1773,22 +1744,36 @@ class Router(Node):
if self.daemons[daemon] == 1:
daemons_list.append(daemon)
tail_log_files = []
check_daemon_files = []
def start_daemon(daemon, extra_opts=None):
daemon_opts = self.daemons_options.get(daemon, "")
# get pid and vty filenames and remove the files
m = re.match(r"(.* |^)-n (\d+)( ?.*|$)", daemon_opts)
dfname = daemon if not m else "{}-{}".format(daemon, m.group(2))
runbase = "/var/run/{}/{}".format(self.routertype, dfname)
# If this is a new system bring-up remove the pid/vty files, otherwise
# do not since apparently presence of the pidfile impacts BGP GR
self.cmd_status("rm -f {0}.pid {0}.vty".format(runbase))
rediropt = " > {0}.out 2> {0}.err".format(daemon)
if daemon == "snmpd":
binary = "/usr/sbin/snmpd"
cmdenv = ""
cmdopt = "{} -C -c /etc/frr/snmpd.conf -p ".format(
daemon_opts
) + "/var/run/{}/snmpd.pid -x /etc/frr/agentx".format(self.routertype)
) + "{}.pid -x /etc/frr/agentx".format(runbase)
# check_daemon_files.append(runbase + ".pid")
else:
binary = os.path.join(self.daemondir, daemon)
check_daemon_files.extend([runbase + ".pid", runbase + ".vty"])
cmdenv = "ASAN_OPTIONS="
if asan_abort:
cmdenv = "abort_on_error=1:"
cmdenv += "log_path={0}/{1}.{2}.asan ".format(
cmdenv += "abort_on_error=1:"
cmdenv += "log_path={0}/{1}.asan.{2} ".format(
self.logdir, self.name, daemon
)
@ -1811,9 +1796,16 @@ class Router(Node):
daemon, self.logdir, self.name
)
cmdopt = "{} --command-log-always --log file:{}.log --log-level debug".format(
daemon_opts, daemon
)
cmdopt = "{} --command-log-always ".format(daemon_opts)
cmdopt += "--log file:{}.log --log-level debug".format(daemon)
if daemon in logd_options:
logdopt = logd_options[daemon]
if "all" in logdopt or self.name in logdopt:
tail_log_files.append(
"{}/{}/{}.log".format(self.logdir, self.name, daemon)
)
if extra_opts:
cmdopt += " " + extra_opts
@ -1840,8 +1832,6 @@ class Router(Node):
logger.info(
"%s: %s %s launched in gdb window", self, self.routertype, daemon
)
# Need better check for daemons running.
time.sleep(5)
else:
if daemon != "snmpd":
cmdopt += " -d "
@ -1900,16 +1890,50 @@ class Router(Node):
start_daemon(daemon)
# Check if daemons are running.
rundaemons = self.cmd("ls -1 /var/run/%s/*.pid" % self.routertype)
if re.search(r"No such file or directory", rundaemons):
return "Daemons are not running"
wait_time = 30 if (gdb_routers or gdb_daemons) else 10
timeout = Timeout(wait_time)
for remaining in timeout:
if not check_daemon_files:
break
check = check_daemon_files[0]
if self.path_exists(check):
check_daemon_files.pop(0)
continue
self.logger.debug("Waiting {}s for {} to appear".format(remaining, check))
time.sleep(0.5)
if check_daemon_files:
assert False, "Timeout({}) waiting for {} to appear on {}".format(
wait_time, check_daemon_files[0], self.name
)
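# Note: this explicit readiness check replaces the one-shot pidfile listing
# and fixed sleep removed above; it polls for each daemon's pid/vty files for
# up to wait_time seconds (30s when gdb is in use, otherwise 10s).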
# Update the permissions on the log files
self.cmd("chown frr:frr -R {}/{}".format(self.logdir, self.name))
self.cmd("chmod ug+rwX,o+r -R {}/{}".format(self.logdir, self.name))
if "frr" in logd_options:
logdopt = logd_options["frr"]
if "all" in logdopt or self.name in logdopt:
tail_log_files.append("{}/{}/frr.log".format(self.logdir, self.name))
for tailf in tail_log_files:
self.run_in_window("tail -f " + tailf, title=tailf, background=True)
return ""
def pid_exists(self, pid):
if pid <= 0:
return False
try:
# If we are not using PID namespaces then we will be a parent of the pid,
# otherwise the init process of the PID namespace will have reaped the proc.
os.waitpid(pid, os.WNOHANG)
except Exception:
pass
rc, o, e = self.cmd_status("kill -0 " + str(pid), warn=False)
return rc == 0 or "No such process" not in e
def killRouterDaemons(
self, daemons, wait=True, assertOnError=True, minErrorVersion="5.1"
):
@ -1929,15 +1953,15 @@ class Router(Node):
if re.search(r"%s" % daemon, d):
daemonpidfile = d.rstrip()
daemonpid = self.cmd("cat %s" % daemonpidfile).rstrip()
if daemonpid.isdigit() and pid_exists(int(daemonpid)):
if daemonpid.isdigit() and self.pid_exists(int(daemonpid)):
logger.debug(
"{}: killing {}".format(
self.name,
os.path.basename(daemonpidfile.rsplit(".", 1)[0]),
)
)
os.kill(int(daemonpid), signal.SIGKILL)
if pid_exists(int(daemonpid)):
self.cmd_status("kill -KILL {}".format(daemonpid))
if self.pid_exists(int(daemonpid)):
numRunning += 1
while wait and numRunning > 0:
sleep(
@ -1951,7 +1975,7 @@ class Router(Node):
for d in dmns[:-1]:
if re.search(r"%s" % daemon, d):
daemonpid = self.cmd("cat %s" % d.rstrip()).rstrip()
if daemonpid.isdigit() and pid_exists(
if daemonpid.isdigit() and self.pid_exists(
int(daemonpid)
):
logger.info(
@ -1962,8 +1986,10 @@ class Router(Node):
),
)
)
os.kill(int(daemonpid), signal.SIGKILL)
if daemonpid.isdigit() and not pid_exists(
self.cmd_status(
"kill -KILL {}".format(daemonpid)
)
if daemonpid.isdigit() and not self.pid_exists(
int(daemonpid)
):
numRunning -= 1


@ -24,8 +24,17 @@ import tempfile
import termios
import tty
from . import linux
from .config import list_to_dict_with_key
try:
from . import linux
from .config import list_to_dict_with_key
except ImportError:
# We cannot use relative imports and still run this module directly as a script, and
# there are some use cases where we want to run this file as a script.
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
import linux
from config import list_to_dict_with_key
ENDMARKER = b"\x00END\x00"
@ -609,7 +618,16 @@ async def doline(
return True
await run_command(
unet, outf, nline, execfmt, banner, hosts, toplevel, kinds, ns_only, interactive
unet,
outf,
nline,
execfmt,
banner,
hosts,
toplevel,
kinds,
ns_only,
interactive,
)
return True
@ -657,10 +675,10 @@ async def cli_client(sockpath, prompt="munet> "):
async def local_cli(unet, outf, prompt, histfile, background):
"""Implement the user-side CLI for local munet."""
if unet:
completer = Completer(unet)
readline.parse_and_bind("tab: complete")
readline.set_completer(completer.complete)
assert unet is not None
completer = Completer(unet)
readline.parse_and_bind("tab: complete")
readline.set_completer(completer.complete)
print("\n--- Munet CLI Starting ---\n\n")
while True:
@ -669,8 +687,6 @@ async def local_cli(unet, outf, prompt, histfile, background):
if line is None:
return
assert unet is not None
if not await doline(unet, line, outf, background):
return
except KeyboardInterrupt:
@ -706,10 +722,19 @@ async def cli_client_connected(unet, background, reader, writer):
break
line = line.decode("utf-8").strip()
# def writef(x):
# writer.write(x.encode("utf-8"))
class EncodingFile:
"""Wrap a writer to encode in utf-8."""
if not await doline(unet, line, writer, background, notty=True):
def __init__(self, writer):
self.writer = writer
def write(self, x):
self.writer.write(x.encode("utf-8"))
def flush(self):
self.writer.flush()
if not await doline(unet, line, EncodingFile(writer), background, notty=True):
logging.debug("server closing cli connection")
return


@ -296,8 +296,12 @@ def be_init(new_pg, exec_args):
# No exec so we are the "child".
new_process_group()
# Reap children as init process
vdebug("installing local handler for SIGCHLD")
signal.signal(signal.SIGCHLD, sig_sigchld)
while True:
logging.info("parent: waiting to reap zombies")
logging.info("init: waiting to reap zombies")
linux.pause()
# NOTREACHED