Merge pull request #4988 from donaldsharp/watchfrr_ignore_daemon

Watchfrr ignore daemon
This commit is contained in:
Russ White 2019-09-17 07:48:14 -04:00 committed by GitHub
commit f0b7ed8823
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 92 additions and 2 deletions

View File

@ -58,6 +58,7 @@ Protocols
vnc vnc
vrrp vrrp
bmp bmp
watchfrr
######## ########
Appendix Appendix

View File

@ -43,6 +43,7 @@ user_RSTFILES = \
doc/user/zebra.rst \ doc/user/zebra.rst \
doc/user/bfd.rst \ doc/user/bfd.rst \
doc/user/flowspec.rst \ doc/user/flowspec.rst \
doc/user/watchfrr.rst \
# end # end
EXTRA_DIST += \ EXTRA_DIST += \

30
doc/user/watchfrr.rst Normal file
View File

@ -0,0 +1,30 @@
.. _watchfrr:
********
WATCHFRR
********
:abbr:`WATCHFRR` is a daemon that handles failed daemon processes and
intelligently restarts them as needed.
Starting WATCHFRR
=================
WATCHFRR is started as part of the normal systemd startup and typically does
not require any end-user management.
WATCHFRR commands
=================
.. index:: show watchfrr
.. clicmd:: show watchfrr
Give status information about the state of the different daemons being
watched by WATCHFRR.
.. index:: [no] watchfrr ignore DAEMON
.. clicmd:: [no] watchfrr ignore DAEMON
Tell WATCHFRR to ignore a particular DAEMON if it becomes unresponsive.
This is particularly useful when you are a developer and need to debug
a working system without WATCHFRR pulling the rug out from under you.

View File

@ -19,3 +19,6 @@ watchfrr_watchfrr_SOURCES = \
watchfrr/watchfrr_errors.c \ watchfrr/watchfrr_errors.c \
watchfrr/watchfrr_vty.c \ watchfrr/watchfrr_vty.c \
# end # end
watchfrr/watchfrr_vty_clippy.c: $(CLIPPY_DEPS)
watchfrr/watchfrr_vty.$(OBJEXT): watchfrr/watchfrr_vty_clippy.c

View File

@ -159,6 +159,15 @@ struct daemon {
struct thread *t_write; struct thread *t_write;
struct daemon *next; struct daemon *next;
struct restart_info restart; struct restart_info restart;
/*
 * When ignore_timeout is set for a given daemon, its timeouts are
 * ignored and the daemon is assumed to be ok. This is meant for
 * debugging a daemon with gdb after FRR has already been started
 * and we realize there is something that needs to be looked at.
 */
bool ignore_timeout;
}; };
#define OPTION_MINRESTART 2000 #define OPTION_MINRESTART 2000
@ -191,6 +200,25 @@ static void phase_check(void);
static void restart_done(struct daemon *dmn); static void restart_done(struct daemon *dmn);
static const char *progname; static const char *progname;
/*
 * Turn the per-daemon "ignore timeout" flag on or off.
 *
 * vty    - terminal that receives the confirmation or error message
 * dname  - daemon name as typed by the operator; it must match the name of
 *          an entry in gs.daemons exactly
 * ignore - new value for the daemon's ignore_timeout flag
 *
 * Walks the gs.daemons list looking for dname. On a match the flag is
 * updated and the new mode ("ignore" or "watch") is reported; otherwise a
 * message saying the daemon is not configured is printed and nothing is
 * changed.
 */
void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname, bool ignore)
{
	struct daemon *dmn;

	for (dmn = gs.daemons; dmn; dmn = dmn->next) {
		/*
		 * Compare the whole name: a prefix compare limited to
		 * strlen(dmn->name) would let input such as "bgpdfoo"
		 * silently select "bgpd".
		 */
		if (strcmp(dmn->name, dname) == 0)
			break;
	}

	if (!dmn) {
		/* Terminate the line so the next prompt starts cleanly. */
		vty_out(vty, "%s is not configured for running at the moment\n",
			dname);
		return;
	}

	dmn->ignore_timeout = ignore;
	vty_out(vty, "%s switching to %s\n", dmn->name,
		ignore ? "ignore" : "watch");
}
static void printhelp(FILE *target) static void printhelp(FILE *target)
{ {
fprintf(target, fprintf(target,
@ -533,7 +561,9 @@ static int wakeup_init(struct thread *t_wakeup)
static void restart_done(struct daemon *dmn) static void restart_done(struct daemon *dmn)
{ {
if (dmn->state != DAEMON_DOWN) { if (dmn->state != DAEMON_DOWN) {
zlog_warn("wtf?"); zlog_warn(
"Daemon: %s: is in %s state but expected it to be in DAEMON_DOWN state",
dmn->name, state_str[dmn->state]);
return; return;
} }
if (dmn->t_wakeup) if (dmn->t_wakeup)
@ -961,6 +991,8 @@ static int wakeup_no_answer(struct thread *t_wakeup)
dmn->t_wakeup = NULL; dmn->t_wakeup = NULL;
dmn->state = DAEMON_UNRESPONSIVE; dmn->state = DAEMON_UNRESPONSIVE;
if (dmn->ignore_timeout)
return 0;
flog_err(EC_WATCHFRR_CONNECTION, flog_err(EC_WATCHFRR_CONNECTION,
"%s state -> unresponsive : no response yet to ping " "%s state -> unresponsive : no response yet to ping "
"sent %ld seconds ago", "sent %ld seconds ago",
@ -1014,7 +1046,8 @@ void watchfrr_status(struct vty *vty)
(long)gs.restart.pid); (long)gs.restart.pid);
for (dmn = gs.daemons; dmn; dmn = dmn->next) { for (dmn = gs.daemons; dmn; dmn = dmn->next) {
vty_out(vty, " %-20s %s\n", dmn->name, state_str[dmn->state]); vty_out(vty, " %-20s %s%s", dmn->name, state_str[dmn->state],
dmn->ignore_timeout ? "/Ignoring Timeout\n" : "\n");
if (dmn->restart.pid) if (dmn->restart.pid)
vty_out(vty, " restart running, pid %ld\n", vty_out(vty, " restart running, pid %ld\n",
(long)dmn->restart.pid); (long)dmn->restart.pid);

View File

@ -41,4 +41,6 @@ extern void watchfrr_status(struct vty *vty);
*/ */
extern bool check_all_up(void); extern bool check_all_up(void);
extern void watchfrr_set_ignore_daemon(struct vty *vty, const char *dname,
bool ignore);
#endif /* FRR_WATCHFRR_H */ #endif /* FRR_WATCHFRR_H */

View File

@ -134,6 +134,23 @@ DEFUN (show_watchfrr,
return CMD_SUCCESS; return CMD_SUCCESS;
} }
#ifndef VTYSH_EXTRACT_PL
#include "watchfrr/watchfrr_vty_clippy.c"
#endif
/*
 * "[no] watchfrr ignore DAEMON": hand the daemon name to
 * watchfrr_set_ignore_daemon(). The plain form requests ignoring; the
 * "no" form requests watching again. Always reports CMD_SUCCESS.
 */
DEFPY (watchfrr_ignore_daemon,
       watchfrr_ignore_daemon_cmd,
       "[no] watchfrr ignore DAEMON$dname",
       NO_STR
       "Watchfrr Specific sub-command\n"
       "Ignore a specified daemon when it does not respond to echo request\n"
       "The daemon to ignore\n")
{
	/* A set "no" token means stop ignoring; otherwise start ignoring. */
	bool ignore = !no;

	watchfrr_set_ignore_daemon(vty, dname, ignore);

	return CMD_SUCCESS;
}
void integrated_write_sigchld(int status) void integrated_write_sigchld(int status)
{ {
uint8_t reply[4] = {0, 0, 0, CMD_WARNING}; uint8_t reply[4] = {0, 0, 0, CMD_WARNING};
@ -168,6 +185,9 @@ void watchfrr_vty_init(void)
integrated_write_pid = -1; integrated_write_pid = -1;
install_element(ENABLE_NODE, &config_write_integrated_cmd); install_element(ENABLE_NODE, &config_write_integrated_cmd);
install_element(ENABLE_NODE, &show_debugging_watchfrr_cmd); install_element(ENABLE_NODE, &show_debugging_watchfrr_cmd);
install_element(ENABLE_NODE, &watchfrr_ignore_daemon_cmd);
install_element(CONFIG_NODE, &show_debugging_watchfrr_cmd); install_element(CONFIG_NODE, &show_debugging_watchfrr_cmd);
install_element(VIEW_NODE, &show_watchfrr_cmd); install_element(VIEW_NODE, &show_watchfrr_cmd);
} }