diff --git a/doc/user/index.rst b/doc/user/index.rst index 6c3b14e062..416c51f13b 100644 --- a/doc/user/index.rst +++ b/doc/user/index.rst @@ -58,6 +58,7 @@ Protocols vnc vrrp bmp + watchfrr ######## Appendix diff --git a/doc/user/subdir.am b/doc/user/subdir.am index 0f0a8a0774..ce519fbfbf 100644 --- a/doc/user/subdir.am +++ b/doc/user/subdir.am @@ -43,6 +43,7 @@ user_RSTFILES = \ doc/user/zebra.rst \ doc/user/bfd.rst \ doc/user/flowspec.rst \ + doc/user/watchfrr.rst \ # end EXTRA_DIST += \ diff --git a/doc/user/watchfrr.rst b/doc/user/watchfrr.rst new file mode 100644 index 0000000000..df04a1e375 --- /dev/null +++ b/doc/user/watchfrr.rst @@ -0,0 +1,30 @@ +.. _watchfrr: + +******** +WATCHFRR +******** + +:abbr:`WATCHFRR` is a daemon that handles failed daemon processes and +intelligently restarts them as needed. + +Starting WATCHFRR +================= + +WATCHFRR is started as per normal systemd startup and typically does not +require end users management. + +WATCHFRR commands +================= + +.. index:: show watchfrr +.. clicmd:: show watchfrr + + Give status information about the state of the different daemons being + watched by WATCHFRR + +.. index:: [no] watchfrr ignore DAEMON +.. clicmd:: [no] watchfrr ignore DAEMON + + Tell WATCHFRR to ignore a particular DAEMON if it goes unresponsive. + This is particularly useful when you are a developer and need to debug + a working system, without watchfrr pulling the rug out from under you. diff --git a/watchfrr/subdir.am b/watchfrr/subdir.am index c27491e55c..30f606c202 100644 --- a/watchfrr/subdir.am +++ b/watchfrr/subdir.am @@ -19,3 +19,6 @@ watchfrr_watchfrr_SOURCES = \ watchfrr/watchfrr_errors.c \ watchfrr/watchfrr_vty.c \ # end + +watchfrr/watchfrr_vty_clippy.c: $(CLIPPY_DEPS) +watchfrr/watchfrr_vty.$(OBJEXT): watchfrr/watchfrr_vty_clippy.c diff --git a/watchfrr/watchfrr.c b/watchfrr/watchfrr.c index c17d381730..a6a910a1db 100644 --- a/watchfrr/watchfrr.c +++ b/watchfrr/watchfrr.c @@ -159,6 +159,15 @@ struct daemon { struct thread *t_write; struct daemon *next; struct restart_info restart; + + /* + * For a given daemon, if we've turned on ignore timeouts + * ignore the timeout value and assume everything is ok + * This is for daemon debugging w/ gdb after we have started + * FRR and realize we have something that needs to be looked + * at + */ + bool ignore_timeout; }; #define OPTION_MINRESTART 2000 @@ -191,6 +200,25 @@ static void phase_check(void); static void restart_done(struct daemon *dmn); static const char *progname; + +void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore) +{ + struct daemon *dmn; + + for (dmn = gs.daemons; dmn; dmn = dmn->next) { + if (strncmp(dmn->name, dname, strlen(dmn->name)) == 0) + break; + } + + if (dmn) { + dmn->ignore_timeout = ignore; + vty_out(vty, "%s switching to %s\n", dmn->name, + ignore ? "ignore" : "watch"); + } else + vty_out(vty, "%s is not configured for running at the moment", + dname); +} + static void printhelp(FILE *target) { fprintf(target, @@ -533,7 +561,9 @@ static int wakeup_init(struct thread *t_wakeup) static void restart_done(struct daemon *dmn) { if (dmn->state != DAEMON_DOWN) { - zlog_warn("wtf?"); + zlog_warn( + "Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state", + dmn->name, state_str[dmn->state]); return; } if (dmn->t_wakeup) @@ -961,6 +991,8 @@ static int wakeup_no_answer(struct thread *t_wakeup) dmn->t_wakeup = NULL; dmn->state = DAEMON_UNRESPONSIVE; + if (dmn->ignore_timeout) + return 0; flog_err(EC_WATCHFRR_CONNECTION, "%s state -> unresponsive : no response yet to ping " "sent %ld seconds ago", @@ -1014,7 +1046,8 @@ void watchfrr_status(struct vty *vty) (long)gs.restart.pid); for (dmn = gs.daemons; dmn; dmn = dmn->next) { - vty_out(vty, " %-20s %s\n", dmn->name, state_str[dmn->state]); + vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state], + dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n"); if (dmn->restart.pid) vty_out(vty, " restart running, pid %ld\n", (long)dmn->restart.pid); diff --git a/watchfrr/watchfrr.h b/watchfrr/watchfrr.h index c5f54769bd..ba6e94960f 100644 --- a/watchfrr/watchfrr.h +++ b/watchfrr/watchfrr.h @@ -41,4 +41,6 @@ extern void watchfrr_status(struct vty *vty); */ extern bool check_all_up(void); +extern void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, + bool ignore); #endif /* FRR_WATCHFRR_H */ diff --git a/watchfrr/watchfrr_vty.c b/watchfrr/watchfrr_vty.c index 9b844d67f2..c06cb89382 100644 --- a/watchfrr/watchfrr_vty.c +++ b/watchfrr/watchfrr_vty.c @@ -134,6 +134,23 @@ DEFUN (show_watchfrr, return CMD_SUCCESS; } +#ifndef VTYSH_EXTRACT_PL +#include "watchfrr/watchfrr_vty_clippy.c" +#endif + +DEFPY (watchfrr_ignore_daemon, + watchfrr_ignore_daemon_cmd, + "[no] watchfrr ignore DAEMON$dname", + NO_STR + "Watchfrr Specific sub-command\n" + "Ignore a specified daemon when it does not respond to echo request\n" + "The daemon to ignore\n") +{ + watchfrr_set_ignore_daemon(vty, dname, no ? false : true ); + + return CMD_SUCCESS; +} + void integrated_write_sigchld(int status) { uint8_t reply[4] = {0, 0, 0, CMD_WARNING}; @@ -168,6 +185,9 @@ void watchfrr_vty_init(void) integrated_write_pid = -1; install_element(ENABLE_NODE, &config_write_integrated_cmd); install_element(ENABLE_NODE, &show_debugging_watchfrr_cmd); + + install_element(ENABLE_NODE, &watchfrr_ignore_daemon_cmd); + install_element(CONFIG_NODE, &show_debugging_watchfrr_cmd); install_element(VIEW_NODE, &show_watchfrr_cmd); }