watchfrr: Allow end users to turn off watchfrr for a particular daemon

Allow an end user who is debugging behavior, with say gdb, to turn
off watchfrr and its attempts to keep control of a daemon's up/responsiveness.

With code change:
donna.cumulusnetworks.com# show watchfrr
watchfrr global phase: Idle
  zebra                Up
  bgpd                 Up/Ignoring Timeout
  staticd              Up

Now grab bgpd with gdb:

sharpd@donna ~/frr4> date ; sudo gdb -p 27893
Mon 16 Sep 2019 01:44:57 PM EDT
GNU gdb (GDB) Fedora 8.3-6.fc30
Copyright (C) 2019 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
    <http://www.gnu.org/software/gdb/documentation/>.

For help, type "help".
Type "apropos word" to search for commands related to "word".
Attaching to process 27893
[New LWP 27894]
[New LWP 27895]
[New LWP 27896]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
0x00007f1787a3e5c7 in poll () from /lib64/libc.so.6
Missing separate debuginfos, use: dnf debuginfo-install glibc-2.29-15.fc30.x86_64 gperftools-libs-2.7-5.fc30.x86_64 json-c-0.13.1-4.fc30.x86_64 libcap-2.26-5.fc30.x86_64 libgcc-9.1.1-1.fc30.x86_64 libgcrypt-1.8.4-3.fc30.x86_64 libgpg-error-1.33-2.fc30.x86_64 libstdc++-9.1.1-1.fc30.x86_64 libxcrypt-4.4.6-2.fc30.x86_64 libyang-0.16.105-1.fc30.x86_64 lua-libs-5.3.5-5.fc30.x86_64 lz4-libs-1.8.3-2.fc30.x86_64 pcre-8.43-2.fc30.x86_64 xz-libs-5.2.4-5.fc30.x86_64
(gdb)

In another window we can see when watchfrr thinks it's not responding:

donna.cumulusnetworks.com# show watchfrr
watchfrr global phase: Idle
  zebra                Up
  bgpd                 Unresponsive/Ignoring Timeout
  staticd              Up

Finally exit gdb and watchfrr now believes bgpd is good to go again:

donna.cumulusnetworks.com# show watchfrr
watchfrr global phase: Idle
  zebra                Up
  bgpd                 Up/Ignoring Timeout
  staticd              Up

Signed-off-by: Donald Sharp <sharpd@cumulusnetworks.com>
This commit is contained in:
Donald Sharp 2019-09-16 13:47:50 -04:00
parent 8e0bab8a1e
commit cc53b605e6
4 changed files with 57 additions and 1 deletions

View File

@ -19,3 +19,6 @@ watchfrr_watchfrr_SOURCES = \
watchfrr/watchfrr_errors.c \
watchfrr/watchfrr_vty.c \
# end
watchfrr/watchfrr_vty_clippy.c: $(CLIPPY_DEPS)
watchfrr/watchfrr_vty.$(OBJEXT): watchfrr/watchfrr_vty_clippy.c

View File

@ -159,6 +159,15 @@ struct daemon {
struct thread *t_write;
struct daemon *next;
struct restart_info restart;
/*
* For a given daemon, if we've turned on ignore timeouts
* ignore the timeout value and assume everything is ok
* This is for daemon debugging w/ gdb after we have started
* FRR and realize we have something that needs to be looked
* at
*/
bool ignore_timeout;
};
#define OPTION_MINRESTART 2000
@ -191,6 +200,25 @@ static void phase_check(void);
static void restart_done(struct daemon *dmn);
static const char *progname;
/*
 * Turn timeout-ignoring on or off for the daemon named dname.
 *
 * vty    - terminal that receives the result message
 * dname  - name of the daemon to look up in gs.daemons
 * ignore - new value for the daemon's ignore_timeout flag
 *
 * If a daemon with that name is being watched, its ignore_timeout flag
 * is set to `ignore` and the new mode is reported on vty.  Otherwise
 * nothing is changed and a "not configured" message is written to vty.
 */
void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
{
	struct daemon *dmn;

	for (dmn = gs.daemons; dmn; dmn = dmn->next) {
		/*
		 * Require an exact name match.  Comparing only the first
		 * strlen(dmn->name) bytes would let any input that merely
		 * starts with a daemon's name (e.g. "bgpdfoo") select that
		 * daemon.
		 */
		if (strcmp(dmn->name, dname) == 0)
			break;
	}

	if (!dmn) {
		/* Terminate the line so the next vty output starts cleanly */
		vty_out(vty,
			"%s is not configured for running at the moment\n",
			dname);
		return;
	}

	dmn->ignore_timeout = ignore;
	vty_out(vty, "%s switching to %s\n", dmn->name,
		ignore ? "ignore" : "watch");
}
static void printhelp(FILE *target)
{
fprintf(target,
@ -961,6 +989,8 @@ static int wakeup_no_answer(struct thread *t_wakeup)
dmn->t_wakeup = NULL;
dmn->state = DAEMON_UNRESPONSIVE;
if (dmn->ignore_timeout)
return 0;
flog_err(EC_WATCHFRR_CONNECTION,
"%s state -> unresponsive : no response yet to ping "
"sent %ld seconds ago",
@ -1014,7 +1044,8 @@ void watchfrr_status(struct vty *vty)
(long)gs.restart.pid);
for (dmn = gs.daemons; dmn; dmn = dmn->next) {
vty_out(vty, " %-20s %s\n", dmn->name, state_str[dmn->state]);
vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
if (dmn->restart.pid)
vty_out(vty, " restart running, pid %ld\n",
(long)dmn->restart.pid);

View File

@ -41,4 +41,6 @@ extern void watchfrr_status(struct vty *vty);
*/
extern bool check_all_up(void);
/*
 * Set (ignore == true) or clear (ignore == false) the ignore_timeout
 * flag of the watched daemon identified by dname, and report the
 * outcome on vty.  If no such daemon is found, only a message is
 * written to vty.
 */
extern void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname,
bool ignore);
#endif /* FRR_WATCHFRR_H */

View File

@ -134,6 +134,23 @@ DEFUN (show_watchfrr,
return CMD_SUCCESS;
}
#ifndef VTYSH_EXTRACT_PL
#include "watchfrr/watchfrr_vty_clippy.c"
#endif
/*
 * "[no] watchfrr ignore DAEMON"
 *
 * Hands the daemon name to watchfrr_set_ignore_daemon().  The ignore
 * flag passed along is false when `no` is set and true otherwise.
 * Always returns CMD_SUCCESS; any feedback to the user is printed by
 * watchfrr_set_ignore_daemon() itself.
 */
DEFPY (watchfrr_ignore_daemon,
       watchfrr_ignore_daemon_cmd,
       "[no] watchfrr ignore DAEMON$dname",
       NO_STR
       "Watchfrr Specific sub-command\n"
       "Ignore a specified daemon when it does not respond to echo request\n"
       "The daemon to ignore\n")
{
	bool ignore = !no;

	watchfrr_set_ignore_daemon(vty, dname, ignore);

	return CMD_SUCCESS;
}
void integrated_write_sigchld(int status)
{
uint8_t reply[4] = {0, 0, 0, CMD_WARNING};
@ -168,6 +185,9 @@ void watchfrr_vty_init(void)
integrated_write_pid = -1;
install_element(ENABLE_NODE, &config_write_integrated_cmd);
install_element(ENABLE_NODE, &show_debugging_watchfrr_cmd);
install_element(ENABLE_NODE, &watchfrr_ignore_daemon_cmd);
install_element(CONFIG_NODE, &show_debugging_watchfrr_cmd);
install_element(VIEW_NODE, &show_watchfrr_cmd);
}