mirror of
https://git.proxmox.com/git/mirror_corosync
synced 2025-08-03 10:50:54 +00:00
Add monitoring and watchdog services.
git-svn-id: http://svn.fedorahosted.org/svn/corosync/trunk@3053 fd59a12c-fef9-0310-b244-a6a79926bd2f
This commit is contained in:
parent
72addbc4cd
commit
07d06c0c0f
36
configure.ac
36
configure.ac
@ -252,6 +252,16 @@ AC_ARG_ENABLE([rdma],
|
||||
[ enable_rdma="no" ])
|
||||
AM_CONDITIONAL(BUILD_RDMA, test x$enable_rdma = xyes)
|
||||
|
||||
# Resource monitoring is opt-in. When the option is not given, set
# enable_monitoring explicitly so the later tests on it see "no".
AC_ARG_ENABLE([monitoring],
	[  --enable-monitoring             : resource monitoring ],,
	[ enable_monitoring="no" ])
|
||||
AM_CONDITIONAL(BUILD_MONITORING, test x$enable_monitoring = xyes)
|
||||
|
||||
# Watchdog support is opt-in. When the option is not given, set
# enable_watchdog explicitly so the later tests on it see "no".
AC_ARG_ENABLE([watchdog],
	[  --enable-watchdog               : Watchdog support ],,
	[ enable_watchdog="no" ])
|
||||
AM_CONDITIONAL(BUILD_WATCHDOG, test x$enable_watchdog = xyes)
|
||||
|
||||
AC_ARG_ENABLE([augeas],
|
||||
[ --enable-augeas : Install the augeas lens for corosync.conf ],,
|
||||
[ enable_augeas="no" ])
|
||||
@ -277,6 +287,7 @@ AC_ARG_WITH([socket-dir],
|
||||
# THIS SECTION MUST DIE!
|
||||
CP=cp
|
||||
OS_LDL="-ldl"
|
||||
have_linux="no"
|
||||
case "$host_os" in
|
||||
*linux*)
|
||||
AC_DEFINE_UNQUOTED([COROSYNC_LINUX], [1],
|
||||
@ -286,6 +297,7 @@ case "$host_os" in
|
||||
OS_LDFLAGS=""
|
||||
OS_DYFLAGS="-rdynamic"
|
||||
DARWIN_OPTS=""
|
||||
have_linux="yes"
|
||||
;;
|
||||
darwin*)
|
||||
AC_DEFINE_UNQUOTED([COROSYNC_DARWIN], [1],
|
||||
@ -387,6 +399,30 @@ if test "x${enable_rdma}" = xyes; then
|
||||
PACKAGE_FEATURES="$PACKAGE_FEATURES rdma"
|
||||
fi
|
||||
|
||||
if test "x${enable_monitoring}" = xyes; then
|
||||
|
||||
AC_CHECK_LIB([statgrab], [sg_get_mem_stats], have_libstatgrab="yes", have_libstatgrab="no")
|
||||
|
||||
if test "x${have_libstatgrab}" = xyes; then
|
||||
AC_DEFINE_UNQUOTED([HAVE_LIBSTATGRAB], 1, [have libstatgrab])
|
||||
statgrab_LIBS="-lstatgrab"
|
||||
else
|
||||
if test "x${have_linux}" = xno; then
|
||||
AC_MSG_ERROR(monitoring requires libstatgrab on non-linux systems)
|
||||
fi
|
||||
fi
|
||||
AC_SUBST([statgrab_LIBS])
|
||||
AC_DEFINE_UNQUOTED([HAVE_MONITORING], 1, [have resource monitoring])
|
||||
PACKAGE_FEATURES="$PACKAGE_FEATURES monitoring"
|
||||
fi
|
||||
|
||||
if test "x${enable_watchdog}" = xyes; then
|
||||
AC_CHECK_HEADER(linux/watchdog.h,,AC_MSG_ERROR(watchdog requires linux/watchdog.h))
|
||||
AC_CHECK_HEADER(linux/reboot.h,,AC_MSG_ERROR(watchdog requires linux/reboot.h))
|
||||
AC_DEFINE_UNQUOTED([HAVE_WATCHDOG], 1, [have watchdog])
|
||||
PACKAGE_FEATURES="$PACKAGE_FEATURES watchdog"
|
||||
fi
|
||||
|
||||
if test "x${enable_augeas}" = xyes; then
|
||||
PACKAGE_FEATURES="$PACKAGE_FEATURES augeas"
|
||||
fi
|
||||
|
@ -5,6 +5,8 @@
|
||||
# Invoke "rpmbuild --without <feature>" or "rpmbuild --with <feature>"
|
||||
# to disable or enable specific features
|
||||
%bcond_with testagents
|
||||
%bcond_with watchdog
|
||||
%bcond_with monitoring
|
||||
|
||||
Name: corosync
|
||||
Summary: The Corosync Cluster Engine and Application Programming Interfaces
|
||||
@ -51,6 +53,12 @@ export rdmacm_LIBS=-lrdmacm \
|
||||
--enable-nss \
|
||||
%if %{with testagents}
|
||||
--enable-testagents \
|
||||
%endif
|
||||
%if %{with watchdog}
|
||||
--enable-watchdog \
|
||||
%endif
|
||||
%if %{with monitoring}
|
||||
--enable-monitoring \
|
||||
%endif
|
||||
--enable-rdma \
|
||||
--with-initddir=%{_initrddir}
|
||||
@ -115,6 +123,12 @@ fi
|
||||
%{_libexecdir}/lcrso/quorum_testquorum.lcrso
|
||||
%{_libexecdir}/lcrso/vsf_quorum.lcrso
|
||||
%{_libexecdir}/lcrso/vsf_ykd.lcrso
|
||||
%if %{with watchdog}
|
||||
%{_libexecdir}/lcrso/service_wd.lcrso
|
||||
%endif
|
||||
%if %{with monitoring}
|
||||
%{_libexecdir}/lcrso/service_mon.lcrso
|
||||
%endif
|
||||
%dir %{_localstatedir}/lib/corosync
|
||||
%dir %{_localstatedir}/log/cluster
|
||||
%{_mandir}/man8/corosync_overview.8*
|
||||
|
@ -59,7 +59,7 @@ libcoroipcs_a_SOURCES = $(COROIPCS_SRC)
|
||||
corosync_SOURCES = main.c util.c sync.c apidef.c service.c \
|
||||
timer.c totemconfig.c mainconfig.c quorum.c schedwrk.c \
|
||||
../lcr/lcr_ifact.c evil.c syncv2.c
|
||||
corosync_LDADD = -ltotem_pg -llogsys -lcoroipcs
|
||||
corosync_LDADD = -ltotem_pg -llogsys -lcoroipcs $(statgrab_LIBS)
|
||||
corosync_DEPENDENCIES = libtotem_pg.so.$(SONAME) liblogsys.so.$(SONAME) libcoroipcs.so.$(SONAME)
|
||||
corosync_LDFLAGS = $(OS_DYFLAGS) -L./
|
||||
|
||||
|
@ -85,6 +85,18 @@ static struct default_service default_services[] = {
|
||||
.name = "corosync_pload",
|
||||
.ver = 0,
|
||||
},
|
||||
#ifdef HAVE_MONITORING
|
||||
{
|
||||
.name = "corosync_mon",
|
||||
.ver = 0,
|
||||
},
|
||||
#endif
|
||||
#ifdef HAVE_WATCHDOG
|
||||
{
|
||||
.name = "corosync_wd",
|
||||
.ver = 0,
|
||||
},
|
||||
#endif
|
||||
{
|
||||
.name = "corosync_quorum",
|
||||
.ver = 0,
|
||||
|
@ -59,7 +59,9 @@ enum corosync_service_types {
|
||||
NTF_SERVICE = 16,
|
||||
AMF_V2_SERVICE = 17,
|
||||
TST_SV1_SERVICE = 18,
|
||||
TST_SV2_SERVICE = 19
|
||||
TST_SV2_SERVICE = 19,
|
||||
MON_SERVICE = 20,
|
||||
WD_SERVICE = 21
|
||||
};
|
||||
|
||||
#ifdef HAVE_SMALL_MEMORY_FOOTPRINT
|
||||
|
@ -38,6 +38,12 @@ INCLUDES = -I$(top_builddir)/include -I$(top_srcdir)/include \
|
||||
-I$(top_srcdir)/include/corosync
|
||||
|
||||
SERVICE_LCRSO = evs cfg cpg confdb pload
|
||||
if BUILD_WATCHDOG
|
||||
SERVICE_LCRSO += wd
|
||||
endif
|
||||
if BUILD_MONITORING
|
||||
SERVICE_LCRSO += mon
|
||||
endif
|
||||
|
||||
QUORUM_LCRSO = votequorum testquorum
|
||||
|
||||
|
635
services/mon.c
Normal file
635
services/mon.c
Normal file
@ -0,0 +1,635 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Red Hat, Inc.
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
* Author: Angus Salkeld <asalkeld@redhat.com>
|
||||
*
|
||||
* This software licensed under BSD license, the text of which follows:
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* - Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
* - Neither the name of the MontaVista Software, Inc. nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from this
|
||||
* software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
||||
* THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <unistd.h>
|
||||
#if defined(HAVE_LIBSTATGRAB)
|
||||
#include <statgrab.h>
|
||||
#endif
|
||||
|
||||
#include <corosync/corotypes.h>
|
||||
#include <corosync/corodefs.h>
|
||||
#include <corosync/lcr/lcr_comp.h>
|
||||
#include <corosync/engine/coroapi.h>
|
||||
#include <corosync/list.h>
|
||||
#include <corosync/totem/coropoll.h>
|
||||
#include <corosync/engine/logsys.h>
|
||||
#include "../exec/fsm.h"
|
||||
|
||||
|
||||
LOGSYS_DECLARE_SUBSYS ("MON");
|
||||
|
||||
#undef ENTER
|
||||
#define ENTER() log_printf (LOGSYS_LEVEL_INFO, "%s", __func__)
|
||||
|
||||
/*
|
||||
* Service Interfaces required by service_message_handler struct
|
||||
*/
|
||||
static int mon_exec_init_fn (
|
||||
struct corosync_api_v1 *corosync_api);
|
||||
|
||||
hdb_handle_t mon_poll = 0;
|
||||
static struct corosync_api_v1 *api;
|
||||
static hdb_handle_t resources_obj;
|
||||
static pthread_t mon_poll_thread;
|
||||
#define MON_DEFAULT_PERIOD 3
|
||||
|
||||
struct corosync_service_engine mon_service_engine = {
|
||||
.name = "corosync resource monitoring service",
|
||||
.id = MON_SERVICE,
|
||||
.priority = 1,
|
||||
.private_data_size = 0,
|
||||
.flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED,
|
||||
.lib_init_fn = NULL,
|
||||
.lib_exit_fn = NULL,
|
||||
.lib_engine = NULL,
|
||||
.lib_engine_count = 0,
|
||||
.exec_engine = NULL,
|
||||
.exec_engine_count = 0,
|
||||
.confchg_fn = NULL,
|
||||
.exec_init_fn = mon_exec_init_fn,
|
||||
.exec_dump_fn = NULL,
|
||||
.sync_mode = CS_SYNC_V2
|
||||
};
|
||||
|
||||
static DECLARE_LIST_INIT (confchg_notify);
|
||||
|
||||
|
||||
struct resource_instance {
|
||||
hdb_handle_t handle;
|
||||
const char *name;
|
||||
poll_timer_handle timer_handle;
|
||||
void (*update_stats_fn) (void *data);
|
||||
struct cs_fsm fsm;
|
||||
int32_t period;
|
||||
objdb_value_types_t max_type;
|
||||
union {
|
||||
int32_t int32;
|
||||
double dbl;
|
||||
} max;
|
||||
};
|
||||
|
||||
static void mem_update_stats_fn (void *data);
|
||||
static void load_update_stats_fn (void *data);
|
||||
|
||||
static struct resource_instance memory_used_inst = {
|
||||
.name = "memory_used",
|
||||
.update_stats_fn = mem_update_stats_fn,
|
||||
.max_type = OBJDB_VALUETYPE_INT32,
|
||||
.max.int32 = INT32_MAX,
|
||||
.period = MON_DEFAULT_PERIOD,
|
||||
};
|
||||
|
||||
static struct resource_instance load_15min_inst = {
|
||||
.name = "load_15min",
|
||||
.update_stats_fn = load_update_stats_fn,
|
||||
.max_type = OBJDB_VALUETYPE_DOUBLE,
|
||||
.max.dbl = INT32_MAX,
|
||||
.period = MON_DEFAULT_PERIOD,
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* F S M
|
||||
*/
|
||||
/* FSM action handlers, defined further down. */
static void mon_config_changed (struct cs_fsm* fsm, int32_t event, void * data);
static void mon_resource_failed (struct cs_fsm* fsm, int32_t event, void * data);

/* State names, as stored in each resource's "state" key. */
const char * mon_disabled_str = "disabled";
const char * mon_ok_str = "ok";
const char * mon_failed_str = "failed";

/* Event names, returned by mon_res_event_to_str(). */
const char * mon_config_changed_str = "config_changed";
const char * mon_failure_str = "failure";
|
||||
|
||||
/* States of a monitored resource. */
enum mon_resource_state {
	MON_S_DISABLED = 0,
	MON_S_OK = 1,
	MON_S_FAILED = 2
};

/* Events fed into a resource's state machine. */
enum mon_resource_event {
	MON_E_CONFIG_CHANGED = 0,
	MON_E_FAILURE = 1
};
|
||||
|
||||
struct cs_fsm_entry mon_fsm_table[] = {
|
||||
{ MON_S_DISABLED, MON_E_CONFIG_CHANGED, mon_config_changed, {MON_S_DISABLED, MON_S_OK, -1} },
|
||||
{ MON_S_DISABLED, MON_E_FAILURE, NULL, {-1} },
|
||||
{ MON_S_OK, MON_E_CONFIG_CHANGED, mon_config_changed, {MON_S_OK, MON_S_DISABLED, -1} },
|
||||
{ MON_S_OK, MON_E_FAILURE, mon_resource_failed, {MON_S_FAILED, -1} },
|
||||
{ MON_S_FAILED, MON_E_CONFIG_CHANGED, mon_config_changed, {MON_S_OK, MON_S_DISABLED, -1} },
|
||||
{ MON_S_FAILED, MON_E_FAILURE, NULL, {-1} },
|
||||
};
|
||||
|
||||
/*
|
||||
* Dynamic loading descriptor
|
||||
*/
|
||||
|
||||
static struct corosync_service_engine *mon_get_service_engine_ver0 (void);
|
||||
|
||||
static struct corosync_service_engine_iface_ver0 mon_service_engine_iface = {
|
||||
.corosync_get_service_engine_ver0 = mon_get_service_engine_ver0
|
||||
};
|
||||
|
||||
static struct lcr_iface corosync_mon_ver0[1] = {
|
||||
{
|
||||
.name = "corosync_mon",
|
||||
.version = 0,
|
||||
.versions_replace = 0,
|
||||
.versions_replace_count = 0,
|
||||
.dependencies = 0,
|
||||
.dependency_count = 0,
|
||||
.constructor = NULL,
|
||||
.destructor = NULL,
|
||||
.interfaces = NULL,
|
||||
}
|
||||
};
|
||||
|
||||
static struct lcr_comp mon_comp_ver0 = {
|
||||
.iface_count = 1,
|
||||
.ifaces = corosync_mon_ver0
|
||||
};
|
||||
|
||||
static struct corosync_service_engine *mon_get_service_engine_ver0 (void)
|
||||
{
|
||||
return (&mon_service_engine);
|
||||
}
|
||||
|
||||
#ifdef COROSYNC_SOLARIS
|
||||
void corosync_lcr_component_register (void);
|
||||
|
||||
void corosync_lcr_component_register (void) {
|
||||
#else
|
||||
__attribute__ ((constructor)) static void corosync_lcr_component_register (void) {
|
||||
#endif
|
||||
lcr_interfaces_set (&corosync_mon_ver0[0], &mon_service_engine_iface);
|
||||
|
||||
lcr_component_register (&mon_comp_ver0);
|
||||
}
|
||||
|
||||
static const char * mon_res_state_to_str(struct cs_fsm* fsm,
|
||||
int32_t state)
|
||||
{
|
||||
switch (state) {
|
||||
case MON_S_DISABLED:
|
||||
return mon_disabled_str;
|
||||
break;
|
||||
case MON_S_OK:
|
||||
return mon_ok_str;
|
||||
break;
|
||||
case MON_S_FAILED:
|
||||
return mon_failed_str;
|
||||
break;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static const char * mon_res_event_to_str(struct cs_fsm* fsm,
|
||||
int32_t event)
|
||||
{
|
||||
switch (event) {
|
||||
case MON_E_CONFIG_CHANGED:
|
||||
return mon_config_changed_str;
|
||||
break;
|
||||
case MON_E_FAILURE:
|
||||
return mon_failure_str;
|
||||
break;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void mon_fsm_state_set (struct cs_fsm* fsm,
|
||||
enum mon_resource_state next_state, struct resource_instance* inst)
|
||||
{
|
||||
enum mon_resource_state prev_state = fsm->curr_state;
|
||||
const char *state_str;
|
||||
|
||||
ENTER();
|
||||
|
||||
cs_fsm_state_set(fsm, next_state, inst);
|
||||
|
||||
if (prev_state == fsm->curr_state) {
|
||||
return;
|
||||
}
|
||||
state_str = mon_res_state_to_str(fsm, fsm->curr_state);
|
||||
|
||||
api->object_key_replace (inst->handle,
|
||||
"state", strlen ("state"),
|
||||
state_str, strlen (state_str));
|
||||
}
|
||||
|
||||
|
||||
static void mon_config_changed (struct cs_fsm* fsm, int32_t event, void * data)
|
||||
{
|
||||
struct resource_instance * inst = (struct resource_instance *)data;
|
||||
char *str;
|
||||
size_t str_len;
|
||||
objdb_value_types_t type;
|
||||
int32_t tmp_value;
|
||||
int32_t res;
|
||||
|
||||
ENTER();
|
||||
|
||||
res = api->object_key_get_typed (inst->handle,
|
||||
"poll_period",
|
||||
(void**)&str, &str_len,
|
||||
&type);
|
||||
if (res == 0) {
|
||||
tmp_value = strtol (str, NULL, 0);
|
||||
if (tmp_value > 0 && tmp_value < 120) {
|
||||
if (inst->period != tmp_value) {
|
||||
inst->period = tmp_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
res = api->object_key_get_typed (inst->handle, "max",
|
||||
(void**)&str, &str_len, &type);
|
||||
if (res != 0) {
|
||||
if (inst->max_type == OBJDB_VALUETYPE_INT32) {
|
||||
inst->max.int32 = INT32_MAX;
|
||||
} else
|
||||
if (inst->max_type == OBJDB_VALUETYPE_DOUBLE) {
|
||||
inst->max.dbl = INT32_MAX;
|
||||
}
|
||||
mon_fsm_state_set (fsm, MON_S_DISABLED, inst);
|
||||
} else {
|
||||
if (inst->max_type == OBJDB_VALUETYPE_INT32) {
|
||||
inst->max.int32 = strtol (str, NULL, 0);
|
||||
} else
|
||||
if (inst->max_type == OBJDB_VALUETYPE_DOUBLE) {
|
||||
inst->max.dbl = strtod (str, NULL);
|
||||
}
|
||||
mon_fsm_state_set (fsm, MON_S_OK, inst);
|
||||
}
|
||||
|
||||
if (mon_poll == 0) {
|
||||
return;
|
||||
}
|
||||
poll_timer_delete (mon_poll, inst->timer_handle);
|
||||
/*
|
||||
* run the updater, incase the period has shortened
|
||||
*/
|
||||
inst->update_stats_fn (inst);
|
||||
poll_timer_add (mon_poll,
|
||||
inst->period * 1000, NULL,
|
||||
inst->update_stats_fn,
|
||||
&inst->timer_handle);
|
||||
}
|
||||
|
||||
void mon_resource_failed (struct cs_fsm* fsm, int32_t event, void * data)
|
||||
{
|
||||
struct resource_instance * inst = (struct resource_instance *)data;
|
||||
ENTER();
|
||||
mon_fsm_state_set (fsm, MON_S_FAILED, inst);
|
||||
}
|
||||
|
||||
/*
 * Return the percentage of memory plus swap that is in use.
 *
 * With libstatgrab the figures come from sg_get_mem_stats() and
 * sg_get_swap_stats(); otherwise, on Linux, from the MemTotal, MemFree,
 * SwapTotal and SwapFree lines of /proc/meminfo.
 *
 * Returns -1 when the statistics cannot be read or when the reported
 * total is not positive (which would otherwise divide by zero).
 */
static int32_t percent_mem_used_get(void)
{
#if defined(HAVE_LIBSTATGRAB)
	sg_mem_stats *mem_stats;
	sg_swap_stats *swap_stats;
	long long total, freemem;

	mem_stats = sg_get_mem_stats();
	swap_stats = sg_get_swap_stats();

	/* both results are dereferenced below, so both must be valid */
	if (mem_stats == NULL || swap_stats == NULL) {
		log_printf (LOGSYS_LEVEL_ERROR, "Unable to get memory stats: %s\n",
			sg_str_error(sg_get_error()));
		return -1;
	}
	total = mem_stats->total + swap_stats->total;
	freemem = mem_stats->free + swap_stats->free;
	if (total <= 0) {
		return -1;
	}
	return ((total - freemem) * 100) / total;
#else
#if defined(COROSYNC_LINUX)
	char *line_ptr;
	char line[512];
	unsigned long long value;
	FILE *f;
	long long total = 0;
	long long freemem = 0;

	if ((f = fopen("/proc/meminfo", "r")) == NULL) {
		return -1;
	}

	while ((line_ptr = fgets(line, sizeof(line), f)) != NULL) {
		/* only lines of the form "<label> <number> kB" are of interest */
		if (sscanf(line_ptr, "%*s %llu kB", &value) != 1) {
			continue;
		}
		value *= 1024;

		if (strncmp(line_ptr, "MemTotal:", 9) == 0) {
			total += value;
		} else if (strncmp(line_ptr, "MemFree:", 8) == 0) {
			freemem += value;
		} else if (strncmp(line_ptr, "SwapTotal:", 10) == 0) {
			total += value;
		} else if (strncmp(line_ptr, "SwapFree:", 9) == 0) {
			freemem += value;
		}
	}

	fclose(f);
	if (total <= 0) {
		/* nothing usable was parsed */
		return -1;
	}
	return ((total - freemem) * 100) / total;
#else
#error need libstatgrab or linux.
#endif /* COROSYNC_LINUX */
#endif /* HAVE_LIBSTATGRAB */
}
|
||||
|
||||
|
||||
static void mem_update_stats_fn (void *data)
|
||||
{
|
||||
struct resource_instance * inst = (struct resource_instance *)data;
|
||||
int32_t new_value;
|
||||
uint64_t timestamp;
|
||||
|
||||
new_value = percent_mem_used_get();
|
||||
if (new_value > 0) {
|
||||
api->object_key_replace (inst->handle,
|
||||
"current", strlen("current"),
|
||||
&new_value, sizeof(new_value));
|
||||
|
||||
timestamp = time (NULL);
|
||||
|
||||
api->object_key_replace (inst->handle,
|
||||
"last_updated", strlen("last_updated"),
|
||||
×tamp, sizeof(time_t));
|
||||
|
||||
if (new_value > inst->max.int32) {
|
||||
cs_fsm_process (&inst->fsm, MON_E_FAILURE, inst);
|
||||
}
|
||||
}
|
||||
poll_timer_add (mon_poll,
|
||||
inst->period * 1000, inst,
|
||||
inst->update_stats_fn,
|
||||
&inst->timer_handle);
|
||||
}
|
||||
|
||||
/*
 * Return the 15 minute load average, or -1 if it cannot be obtained.
 *
 * Uses sg_get_load_stats() when libstatgrab is available, otherwise
 * getloadavg() on Linux.
 */
static double min15_loadavg_get(void)
{
#if defined(HAVE_LIBSTATGRAB)
	sg_load_stats *load_stats;
	load_stats = sg_get_load_stats ();
	if (load_stats == NULL) {
		log_printf (LOGSYS_LEVEL_ERROR, "Unable to get load stats: %s\n",
			sg_str_error (sg_get_error()));
		return -1;
	}
	return load_stats->min15;
#else
#if defined(COROSYNC_LINUX)
	double loadav[3];
	/*
	 * getloadavg() returns the number of samples it filled in, which
	 * may be fewer than requested; the third sample is needed here.
	 */
	if (getloadavg(loadav,3) != 3) {
		return -1;
	}
	return loadav[2];
#else
#error need libstatgrab or linux.
#endif /* COROSYNC_LINUX */
#endif /* HAVE_LIBSTATGRAB */
}
|
||||
|
||||
static void load_update_stats_fn (void *data)
|
||||
{
|
||||
struct resource_instance * inst = (struct resource_instance *)data;
|
||||
uint64_t timestamp;
|
||||
int32_t res = 0;
|
||||
double min15 = min15_loadavg_get();
|
||||
|
||||
if (min15 < 0) {
|
||||
}
|
||||
res = api->object_key_replace (inst->handle,
|
||||
"current", strlen("current"),
|
||||
&min15, sizeof (min15));
|
||||
if (res != 0)
|
||||
log_printf (LOGSYS_LEVEL_ERROR, "replace current failed: %d", res);
|
||||
|
||||
timestamp = cs_timestamp_get();
|
||||
|
||||
res = api->object_key_replace (inst->handle,
|
||||
"last_updated", strlen("last_updated"),
|
||||
×tamp, sizeof(uint64_t));
|
||||
if (res != 0)
|
||||
log_printf (LOGSYS_LEVEL_ERROR, "replace last_updated failed: %d", res);
|
||||
|
||||
if (min15 > inst->max.dbl) {
|
||||
cs_fsm_process (&inst->fsm, MON_E_FAILURE, &inst);
|
||||
}
|
||||
|
||||
poll_timer_add (mon_poll,
|
||||
inst->period * 1000, inst,
|
||||
inst->update_stats_fn,
|
||||
&inst->timer_handle);
|
||||
}
|
||||
|
||||
static void *mon_thread_handler (void * unused)
|
||||
{
|
||||
#ifdef HAVE_LIBSTATGRAB
|
||||
sg_init();
|
||||
#endif /* HAVE_LIBSTATGRAB */
|
||||
mon_poll = poll_create ();
|
||||
|
||||
poll_timer_add (mon_poll,
|
||||
memory_used_inst.period * 1000,
|
||||
&memory_used_inst,
|
||||
memory_used_inst.update_stats_fn,
|
||||
&memory_used_inst.timer_handle);
|
||||
|
||||
poll_timer_add (mon_poll,
|
||||
load_15min_inst.period * 1000,
|
||||
&load_15min_inst,
|
||||
load_15min_inst.update_stats_fn,
|
||||
&load_15min_inst.timer_handle);
|
||||
poll_run (mon_poll);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int object_find_or_create (
|
||||
hdb_handle_t parent_object_handle,
|
||||
hdb_handle_t *object_handle,
|
||||
const void *object_name,
|
||||
size_t object_name_len)
|
||||
{
|
||||
hdb_handle_t obj_finder;
|
||||
hdb_handle_t obj;
|
||||
int ret = -1;
|
||||
|
||||
api->object_find_create (
|
||||
parent_object_handle,
|
||||
object_name,
|
||||
object_name_len,
|
||||
&obj_finder);
|
||||
|
||||
if (api->object_find_next (obj_finder, &obj) == 0) {
|
||||
/* found it */
|
||||
*object_handle = obj;
|
||||
ret = 0;
|
||||
}
|
||||
else {
|
||||
ret = api->object_create (parent_object_handle,
|
||||
object_handle,
|
||||
object_name, object_name_len);
|
||||
}
|
||||
|
||||
api->object_find_destroy (obj_finder);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void mon_key_change_notify (object_change_type_t change_type,
|
||||
hdb_handle_t parent_object_handle,
|
||||
hdb_handle_t object_handle,
|
||||
const void *object_name_pt, size_t object_name_len,
|
||||
const void *key_name_pt, size_t key_len,
|
||||
const void *key_value_pt, size_t key_value_len,
|
||||
void *priv_data_pt)
|
||||
{
|
||||
struct resource_instance* inst = (struct resource_instance*)priv_data_pt;
|
||||
|
||||
if ((strcmp ((char*)key_name_pt, "max") == 0) ||
|
||||
(strcmp ((char*)key_name_pt, "poll_period") == 0)) {
|
||||
ENTER();
|
||||
cs_fsm_process (&inst->fsm, MON_E_CONFIG_CHANGED, inst);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Create (or find) the object for one resource below parent, populate
 * its "current", "last_updated" and "state" keys, set up its state
 * machine, pick up or publish the "poll_period" key, process an initial
 * MON_E_CONFIG_CHANGED event and start tracking key changes.
 *
 * If the resource object cannot be found or created, an error is
 * logged and the instance is left uninitialised.
 */
static void mon_instance_init (hdb_handle_t parent, struct resource_instance* inst)
{
	int32_t res;
	char mon_period_str[32];
	char *period_value;
	size_t mon_period_len;
	objdb_value_types_t mon_period_type;
	long tmp_value;
	int32_t zero_32 = 0;
	int64_t zero_64 = 0;
	double zero_double = 0;

	ENTER();

	res = object_find_or_create (parent,
		&inst->handle,
		inst->name, strlen (inst->name));
	if (res != 0) {
		log_printf (LOGSYS_LEVEL_ERROR,
			"unable to create object for resource %s", inst->name);
		return;
	}

	if (inst->max_type == OBJDB_VALUETYPE_INT32) {
		api->object_key_create_typed (inst->handle,
			"current", &zero_32,
			sizeof (zero_32), inst->max_type);
	} else {
		api->object_key_create_typed (inst->handle,
			"current", &zero_double,
			sizeof (zero_double), inst->max_type);
	}

	api->object_key_create_typed (inst->handle,
		"last_updated", &zero_64,
		sizeof (zero_64), OBJDB_VALUETYPE_INT64);

	api->object_key_create_typed (inst->handle,
		"state", mon_disabled_str, strlen (mon_disabled_str),
		OBJDB_VALUETYPE_STRING);

	inst->fsm.name = inst->name;
	inst->fsm.curr_entry = 0;
	inst->fsm.curr_state = MON_S_DISABLED;
	inst->fsm.table = mon_fsm_table;
	inst->fsm.entries = sizeof(mon_fsm_table) / sizeof(struct cs_fsm_entry);
	inst->fsm.state_to_str = mon_res_state_to_str;
	inst->fsm.event_to_str = mon_res_event_to_str;

	/*
	 * object_key_get_typed() hands back a pointer to the stored value,
	 * so it must be received in a pointer variable, not in the local
	 * formatting buffer.
	 */
	res = api->object_key_get_typed (inst->handle,
			"poll_period",
			(void**)&period_value, &mon_period_len,
			&mon_period_type);
	if (res != 0) {
		/* key absent: publish the built-in default */
		mon_period_len = snprintf (mon_period_str, sizeof (mon_period_str), "%d",
			inst->period);
		api->object_key_create_typed (inst->handle,
			"poll_period", mon_period_str,
			mon_period_len,
			OBJDB_VALUETYPE_STRING);
	}
	else {
		tmp_value = strtol (period_value, NULL, 0);
		if (tmp_value > 0 && tmp_value < 120)
			inst->period = (int32_t)tmp_value;
	}
	cs_fsm_process (&inst->fsm, MON_E_CONFIG_CHANGED, inst);

	/*
	 * No timer is armed here: mon_poll is only created later by
	 * mon_thread_handler(), which arms the timers for the built-in
	 * resources itself.
	 */

	/* the change callback needs the instance as its private data */
	api->object_track_start (inst->handle, OBJECT_TRACK_DEPTH_ONE,
		mon_key_change_notify,
		NULL, NULL, NULL, inst);

}
|
||||
|
||||
static int mon_exec_init_fn (
|
||||
struct corosync_api_v1 *corosync_api)
|
||||
{
|
||||
hdb_handle_t obj;
|
||||
hdb_handle_t parent;
|
||||
|
||||
#ifdef COROSYNC_SOLARIS
|
||||
logsys_subsys_init();
|
||||
#endif
|
||||
api = corosync_api;
|
||||
ENTER();
|
||||
|
||||
object_find_or_create (OBJECT_PARENT_HANDLE,
|
||||
&resources_obj,
|
||||
"resources", strlen ("resources"));
|
||||
|
||||
object_find_or_create (resources_obj,
|
||||
&obj,
|
||||
"system", strlen ("system"));
|
||||
|
||||
parent = obj;
|
||||
|
||||
mon_instance_init (parent, &memory_used_inst);
|
||||
mon_instance_init (parent, &load_15min_inst);
|
||||
|
||||
|
||||
pthread_create (&mon_poll_thread, NULL, mon_thread_handler, NULL);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
755
services/wd.c
Normal file
755
services/wd.c
Normal file
@ -0,0 +1,755 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Red Hat, Inc.
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
* Author: Angus Salkeld <asalkeld@redhat.com>
|
||||
*
|
||||
* This software licensed under BSD license, the text of which follows:
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* - Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
* - Neither the name of the MontaVista Software, Inc. nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from this
|
||||
* software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
||||
* THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/watchdog.h>
|
||||
#include <linux/reboot.h>
|
||||
|
||||
#include <corosync/corotypes.h>
|
||||
#include <corosync/corodefs.h>
|
||||
#include <corosync/lcr/lcr_comp.h>
|
||||
#include <corosync/engine/coroapi.h>
|
||||
#include <corosync/list.h>
|
||||
#include <corosync/engine/logsys.h>
|
||||
#include "../exec/fsm.h"
|
||||
|
||||
|
||||
typedef enum {
|
||||
WD_RESOURCE_GOOD,
|
||||
WD_RESOURCE_FAILED,
|
||||
WD_RESOURCE_STATE_UNKNOWN,
|
||||
WD_RESOURCE_NOT_MONITORED
|
||||
} wd_resource_state_t;
|
||||
|
||||
struct resource {
|
||||
hdb_handle_t handle;
|
||||
char *recovery;
|
||||
char name[128];
|
||||
time_t last_updated;
|
||||
struct cs_fsm fsm;
|
||||
|
||||
corosync_timer_handle_t check_timer;
|
||||
uint32_t check_timeout;
|
||||
};
|
||||
|
||||
LOGSYS_DECLARE_SUBSYS("WD");
|
||||
|
||||
/*
|
||||
* Service Interfaces required by service_message_handler struct
|
||||
*/
|
||||
static int wd_exec_init_fn (
|
||||
struct corosync_api_v1 *corosync_api);
|
||||
static int wd_exec_exit_fn (void);
|
||||
static void wd_resource_check_fn (void* resource_ref);
|
||||
|
||||
static struct corosync_api_v1 *api;
|
||||
#define WD_DEFAULT_TIMEOUT 6
|
||||
static uint32_t watchdog_timeout = WD_DEFAULT_TIMEOUT;
|
||||
static uint32_t tickle_timeout = (WD_DEFAULT_TIMEOUT / 2);
|
||||
static int dog = -1;
|
||||
static corosync_timer_handle_t wd_timer;
|
||||
static hdb_handle_t resources_obj;
|
||||
static int watchdog_ok = 1;
|
||||
|
||||
struct corosync_service_engine wd_service_engine = {
|
||||
.name = "corosync self-fencing service",
|
||||
.id = WD_SERVICE,
|
||||
.priority = 1,
|
||||
.private_data_size = 0,
|
||||
.flow_control = CS_LIB_FLOW_CONTROL_REQUIRED,
|
||||
.lib_init_fn = NULL,
|
||||
.lib_exit_fn = NULL,
|
||||
.lib_engine = NULL,
|
||||
.lib_engine_count = 0,
|
||||
.exec_engine = NULL,
|
||||
.exec_engine_count = 0,
|
||||
.confchg_fn = NULL,
|
||||
.exec_init_fn = wd_exec_init_fn,
|
||||
.exec_exit_fn = wd_exec_exit_fn,
|
||||
.exec_dump_fn = NULL,
|
||||
.sync_mode = CS_SYNC_V2
|
||||
};
|
||||
|
||||
static DECLARE_LIST_INIT (confchg_notify);
|
||||
|
||||
/*
|
||||
* F S M
|
||||
*/
|
||||
/* FSM action handlers, defined further down. */
static void wd_config_changed (struct cs_fsm* fsm, int32_t event, void * data);
static void wd_resource_failed (struct cs_fsm* fsm, int32_t event, void * data);

/* States of a watched resource. */
enum wd_resource_state {
	WD_S_GOOD = 0,
	WD_S_FAILED = 1,
	WD_S_DISABLED = 2
};

/* Events fed into a resource's state machine. */
enum wd_resource_event {
	WD_E_FAILURE = 0,
	WD_E_CONFIG_CHANGED = 1
};

/* State names returned by wd_res_state_to_str(). */
const char * wd_ok_str = "ok";
const char * wd_failed_str = "failed";
const char * wd_disabled_str = "disabled";

/* Event names returned by wd_res_event_to_str(). */
const char * wd_failure_str = "failure";
const char * wd_config_changed_str = "config_changed";
|
||||
|
||||
struct cs_fsm_entry wd_fsm_table[] = {
|
||||
{ WD_S_DISABLED, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_DISABLED, WD_S_GOOD, -1} },
|
||||
{ WD_S_DISABLED, WD_E_FAILURE, NULL, {-1} },
|
||||
{ WD_S_GOOD, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_GOOD, WD_S_DISABLED, -1} },
|
||||
{ WD_S_GOOD, WD_E_FAILURE, wd_resource_failed, {WD_S_FAILED, -1} },
|
||||
{ WD_S_FAILED, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_GOOD, WD_S_DISABLED, -1} },
|
||||
{ WD_S_FAILED, WD_E_FAILURE, NULL, {-1} },
|
||||
};
|
||||
|
||||
/*
|
||||
* Dynamic loading descriptor
|
||||
*/
|
||||
|
||||
static struct corosync_service_engine *wd_get_service_engine_ver0 (void);
|
||||
|
||||
static struct corosync_service_engine_iface_ver0 wd_service_engine_iface = {
|
||||
.corosync_get_service_engine_ver0 = wd_get_service_engine_ver0
|
||||
};
|
||||
|
||||
static struct lcr_iface corosync_wd_ver0[1] = {
|
||||
{
|
||||
.name = "corosync_wd",
|
||||
.version = 0,
|
||||
.versions_replace = 0,
|
||||
.versions_replace_count = 0,
|
||||
.dependencies = 0,
|
||||
.dependency_count = 0,
|
||||
.constructor = NULL,
|
||||
.destructor = NULL,
|
||||
.interfaces = NULL,
|
||||
}
|
||||
};
|
||||
|
||||
static struct lcr_comp wd_comp_ver0 = {
|
||||
.iface_count = 1,
|
||||
.ifaces = corosync_wd_ver0
|
||||
};
|
||||
|
||||
static struct corosync_service_engine *wd_get_service_engine_ver0 (void)
|
||||
{
|
||||
return (&wd_service_engine);
|
||||
}
|
||||
|
||||
#ifdef COROSYNC_SOLARIS
|
||||
void corosync_lcr_component_register (void);
|
||||
|
||||
void corosync_lcr_component_register (void) {
|
||||
#else
|
||||
__attribute__ ((constructor)) static void corosync_lcr_component_register (void) {
|
||||
#endif
|
||||
lcr_interfaces_set (&corosync_wd_ver0[0], &wd_service_engine_iface);
|
||||
|
||||
lcr_component_register (&wd_comp_ver0);
|
||||
}
|
||||
|
||||
static int object_find_or_create (
|
||||
hdb_handle_t parent_object_handle,
|
||||
hdb_handle_t *object_handle,
|
||||
const void *object_name,
|
||||
size_t object_name_len)
|
||||
{
|
||||
hdb_handle_t obj_finder;
|
||||
hdb_handle_t obj;
|
||||
int ret = -1;
|
||||
|
||||
api->object_find_create (
|
||||
parent_object_handle,
|
||||
object_name,
|
||||
object_name_len,
|
||||
&obj_finder);
|
||||
|
||||
if (api->object_find_next (obj_finder, &obj) == 0) {
|
||||
/* found it */
|
||||
*object_handle = obj;
|
||||
ret = 0;
|
||||
}
|
||||
else {
|
||||
ret = api->object_create (parent_object_handle,
|
||||
object_handle,
|
||||
object_name, object_name_len);
|
||||
}
|
||||
|
||||
api->object_find_destroy (obj_finder);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const char * wd_res_state_to_str(struct cs_fsm* fsm,
|
||||
int32_t state)
|
||||
{
|
||||
switch (state) {
|
||||
case WD_S_DISABLED:
|
||||
return wd_disabled_str;
|
||||
break;
|
||||
case WD_S_GOOD:
|
||||
return wd_ok_str;
|
||||
break;
|
||||
case WD_S_FAILED:
|
||||
return wd_failed_str;
|
||||
break;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static const char * wd_res_event_to_str(struct cs_fsm* fsm,
|
||||
int32_t event)
|
||||
{
|
||||
switch (event) {
|
||||
case WD_E_CONFIG_CHANGED:
|
||||
return wd_config_changed_str;
|
||||
break;
|
||||
case WD_E_FAILURE:
|
||||
return wd_failure_str;
|
||||
break;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* returns (0 == OK, 1 == failed)
|
||||
*/
|
||||
static int32_t wd_resource_has_failed (struct resource *ref)
|
||||
{
|
||||
hdb_handle_t resource = ref->handle;
|
||||
int res;
|
||||
char* state;
|
||||
size_t state_len;
|
||||
objdb_value_types_t type;
|
||||
time_t *last_updated;
|
||||
time_t my_time;
|
||||
size_t last_updated_len;
|
||||
|
||||
res = api->object_key_get_typed (resource,
|
||||
"last_updated", (void*)&last_updated, &last_updated_len, &type);
|
||||
if (res != 0) {
|
||||
/* key does not exist.
|
||||
*/
|
||||
return 1;
|
||||
}
|
||||
res = api->object_key_get_typed (resource,
|
||||
"state", (void**)&state, &state_len, &type);
|
||||
if (res != 0 || strncmp (state, "disabled", strlen ("disabled")) == 0) {
|
||||
/* key does not exist.
|
||||
*/
|
||||
return 1;
|
||||
}
|
||||
|
||||
my_time = time (NULL);
|
||||
|
||||
if ((*last_updated + ref->check_timeout) < my_time) {
|
||||
log_printf (LOGSYS_LEVEL_INFO, "delayed %ld + %d < %ld",
|
||||
*last_updated, ref->check_timeout, my_time);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if ((*last_updated + ref->check_timeout) < my_time ||
|
||||
strcmp (state, "bad") == 0) {
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void wd_config_changed (struct cs_fsm* fsm, int32_t event, void * data)
|
||||
{
|
||||
int res;
|
||||
size_t len;
|
||||
char *state;
|
||||
objdb_value_types_t type;
|
||||
char mon_period_str[32];
|
||||
int32_t tmp_value;
|
||||
struct resource *ref = (struct resource*)data;
|
||||
|
||||
res = api->object_key_get_typed (ref->handle,
|
||||
"poll_period",
|
||||
(void**)&mon_period_str, &len,
|
||||
&type);
|
||||
if (res == 0) {
|
||||
tmp_value = strtol (mon_period_str, NULL, 0);
|
||||
if (tmp_value > 0 && tmp_value < 120)
|
||||
ref->check_timeout = (tmp_value * 5)/4;
|
||||
}
|
||||
|
||||
res = api->object_key_get_typed (ref->handle,
|
||||
"recovery", (void*)&ref->recovery, &len, &type);
|
||||
if (res != 0) {
|
||||
/* key does not exist.
|
||||
*/
|
||||
log_printf (LOGSYS_LEVEL_WARNING,
|
||||
"resource %s missing a recovery key.", ref->name);
|
||||
cs_fsm_state_set(&ref->fsm, WD_S_DISABLED, ref);
|
||||
return;
|
||||
}
|
||||
res = api->object_key_get_typed (ref->handle,
|
||||
"state", (void*)&state, &len, &type);
|
||||
if (res != 0) {
|
||||
/* key does not exist.
|
||||
*/
|
||||
log_printf (LOGSYS_LEVEL_WARNING,
|
||||
"resource %s missing a state key.", ref->name);
|
||||
cs_fsm_state_set(&ref->fsm, WD_S_DISABLED, ref);
|
||||
return;
|
||||
}
|
||||
|
||||
cs_fsm_state_set(&ref->fsm, WD_S_GOOD, ref);
|
||||
|
||||
if (ref->check_timer) {
|
||||
api->timer_delete(ref->check_timer);
|
||||
}
|
||||
api->timer_add_duration((unsigned long long)ref->check_timeout*1000000000,
|
||||
ref,
|
||||
wd_resource_check_fn, &ref->check_timer);
|
||||
|
||||
}
|
||||
|
||||
static void wd_resource_failed (struct cs_fsm* fsm, int32_t event, void * data)
|
||||
{
|
||||
struct resource* ref = (struct resource*)data;
|
||||
|
||||
if (ref->check_timer) {
|
||||
api->timer_delete(ref->check_timer);
|
||||
}
|
||||
|
||||
log_printf (LOGSYS_LEVEL_CRIT, "%s resource \"%s\" failed!",
|
||||
ref->recovery, (char*)ref->name);
|
||||
if (strcmp (ref->recovery, "watchdog") == 0 ||
|
||||
strcmp (ref->recovery, "quit") == 0) {
|
||||
watchdog_ok = 0;
|
||||
}
|
||||
else if (strcmp (ref->recovery, "reboot") == 0) {
|
||||
//reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, LINUX_REBOOT_CMD_RESTART, NULL);
|
||||
}
|
||||
else if (strcmp (ref->recovery, "shutdown") == 0) {
|
||||
//reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, LINUX_REBOOT_CMD_POWER_OFF, NULL);
|
||||
}
|
||||
cs_fsm_state_set(fsm, WD_S_FAILED, data);
|
||||
}
|
||||
|
||||
static void wd_key_changed(object_change_type_t change_type,
|
||||
hdb_handle_t parent_object_handle,
|
||||
hdb_handle_t object_handle,
|
||||
const void *object_name_pt, size_t object_name_len,
|
||||
const void *key_name_pt, size_t key_len,
|
||||
const void *key_value_pt, size_t key_value_len,
|
||||
void *priv_data_pt)
|
||||
{
|
||||
struct resource* ref = (struct resource*)priv_data_pt;
|
||||
|
||||
if (strcmp(key_name_pt, "last_updated") == 0 ||
|
||||
strcmp(key_name_pt, "current") == 0) {
|
||||
return;
|
||||
}
|
||||
// log_printf (LOGSYS_LEVEL_WARNING,
|
||||
// "watchdog resource key changed: %s.%s=%s ref=%p.",
|
||||
// (char*)object_name_pt, (char*)key_name_pt, (char*)key_value_pt, ref);
|
||||
|
||||
if (ref == NULL) {
|
||||
return;
|
||||
}
|
||||
cs_fsm_process(&ref->fsm, WD_E_CONFIG_CHANGED, ref);
|
||||
}
|
||||
|
||||
static void wd_object_destroyed(
|
||||
hdb_handle_t parent_object_handle,
|
||||
const void *name_pt, size_t name_len,
|
||||
void *priv_data_pt)
|
||||
{
|
||||
struct resource* ref = (struct resource*)priv_data_pt;
|
||||
|
||||
log_printf (LOGSYS_LEVEL_WARNING,
|
||||
"watchdog resource \"%s\" deleted from objdb!",
|
||||
(char*)name_pt);
|
||||
|
||||
if (ref) {
|
||||
api->timer_delete(ref->check_timer);
|
||||
ref->check_timer = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static void wd_resource_check_fn (void* resource_ref)
|
||||
{
|
||||
struct resource* ref = (struct resource*)resource_ref;
|
||||
|
||||
log_printf (LOGSYS_LEVEL_INFO,
|
||||
"checking watchdog resource \"%s\".",
|
||||
ref->name);
|
||||
if (wd_resource_has_failed (ref) ) {
|
||||
cs_fsm_process(&ref->fsm, WD_E_FAILURE, ref);
|
||||
log_printf (LOGSYS_LEVEL_CRIT,
|
||||
"watchdog resource \"%s\" failed!",
|
||||
(char*)ref->name);
|
||||
return;
|
||||
}
|
||||
api->timer_add_duration((unsigned long long)ref->check_timeout*1000000000,
|
||||
ref, wd_resource_check_fn, &ref->check_timer);
|
||||
}
|
||||
|
||||
|
||||
/*
 * Start watching the resource stored in objdb object resource_obj.
 *
 * Allocates the bookkeeping struct resource, initialises its state
 * machine in WD_S_DISABLED, makes sure a "poll_period" key exists
 * (creating it from the default timeout when absent, otherwise deriving
 * the check timeout from it) and starts tracking the object.  When the
 * "recovery" and "state" keys are both present, the periodic check timer
 * is armed and the resource is moved to WD_S_GOOD; otherwise it stays
 * disabled until a key change arrives through wd_key_changed().
 */
static void wd_resource_create (hdb_handle_t resource_obj)
{
	int res;
	size_t len;
	char *state;
	char *period_value = NULL;
	objdb_value_types_t type;
	char mon_period_str[32];
	int32_t tmp_value;
	/* zeroed so that no member is ever read uninitialised */
	struct resource *ref = calloc (1, sizeof (*ref));

	if (ref == NULL) {
		log_printf (LOGSYS_LEVEL_ERROR,
			"%s : out of memory", __func__);
		return;
	}

	ref->handle = resource_obj;
	ref->check_timeout = WD_DEFAULT_TIMEOUT;
	ref->check_timer = NULL;
	/* NOTE(review): assumes ref->name is large enough for len + 1 bytes - confirm */
	api->object_name_get (resource_obj,
		ref->name,
		&len);
	ref->name[len] = '\0';
	ref->fsm.name = ref->name;
	ref->fsm.table = wd_fsm_table;
	ref->fsm.entries = sizeof(wd_fsm_table) / sizeof(struct cs_fsm_entry);
	ref->fsm.curr_entry = 0;
	ref->fsm.curr_state = WD_S_DISABLED;
	ref->fsm.state_to_str = wd_res_state_to_str;
	ref->fsm.event_to_str = wd_res_event_to_str;
	api->object_priv_set (resource_obj, NULL);

	/*
	 * object_key_get_typed() hands back a pointer to the stored value,
	 * so it is received in period_value; mon_period_str only serves as
	 * scratch space for formatting and parsing.
	 */
	res = api->object_key_get_typed (resource_obj,
			"poll_period",
			(void**)&period_value, &len,
			&type);
	if (res != 0) {
		log_printf (LOGSYS_LEVEL_ERROR, "%s : %d",__func__, res);
		len = snprintf (mon_period_str, sizeof (mon_period_str),
			"%d", ref->check_timeout);
		api->object_key_create_typed (resource_obj,
			"poll_period", &mon_period_str,
			len,
			OBJDB_VALUETYPE_STRING);
	}
	else if (period_value != NULL) {
		/* the value is stored with an explicit length: terminate a copy */
		if (len >= sizeof (mon_period_str)) {
			len = sizeof (mon_period_str) - 1;
		}
		memcpy (mon_period_str, period_value, len);
		mon_period_str[len] = '\0';

		tmp_value = strtol (mon_period_str, NULL, 0);
		if (tmp_value > 0 && tmp_value < 120) {
			ref->check_timeout = (tmp_value * 5)/4;
		}
	}

	api->object_track_start (resource_obj, OBJECT_TRACK_DEPTH_ONE,
			wd_key_changed, NULL, wd_object_destroyed,
			NULL, ref);

	res = api->object_key_get_typed (resource_obj,
			"recovery", (void*)&ref->recovery, &len, &type);
	if (res != 0) {
		/* key does not exist.
		 */
		log_printf (LOGSYS_LEVEL_WARNING,
			"resource %s missing a recovery key.", ref->name);
		return;
	}
	res = api->object_key_get_typed (resource_obj,
			"state", (void*)&state, &len, &type);
	if (res != 0) {
		/* key does not exist.
		 */
		log_printf (LOGSYS_LEVEL_WARNING,
			"resource %s missing a state key.", ref->name);
		return;
	}

	res = api->object_key_get_typed (resource_obj,
			"last_updated", (void*)&ref->last_updated, &len, &type);
	if (res != 0) {
		/* key does not exist.
		 */
		ref->last_updated = 0;
	}

	api->timer_add_duration((unsigned long long)ref->check_timeout*1000000000,
		ref,
		wd_resource_check_fn, &ref->check_timer);

	cs_fsm_state_set(&ref->fsm, WD_S_GOOD, ref);
}
|
||||
|
||||
|
||||
static void wd_tickle_fn (void* arg)
|
||||
{
|
||||
ENTER();
|
||||
|
||||
if (watchdog_ok) {
|
||||
if (dog > 0)
|
||||
ioctl(dog, WDIOC_KEEPALIVE, &watchdog_ok);
|
||||
}
|
||||
else {
|
||||
log_printf (LOGSYS_LEVEL_ALERT, "NOT tickling the watchdog!");
|
||||
}
|
||||
|
||||
api->timer_add_duration((unsigned long long)tickle_timeout*1000000000, NULL,
|
||||
wd_tickle_fn, &wd_timer);
|
||||
}
|
||||
|
||||
/*
 * objdb create callback, registered in wd_scan_resources() on each
 * resource type object: starts watching the newly created object.
 * Only object_handle is used.
 */
static void wd_resource_object_created(hdb_handle_t parent_object_handle,
	hdb_handle_t object_handle,
	const void *name_pt, size_t name_len,
	void *priv_data_pt)
{
	hdb_handle_t new_resource = object_handle;

	wd_resource_create (new_resource);
}
|
||||
|
||||
static void wd_scan_resources (void)
|
||||
{
|
||||
hdb_handle_t obj_finder;
|
||||
hdb_handle_t obj_finder2;
|
||||
hdb_handle_t resource_type;
|
||||
hdb_handle_t resource;
|
||||
int res;
|
||||
|
||||
ENTER();
|
||||
|
||||
api->object_find_create (
|
||||
OBJECT_PARENT_HANDLE,
|
||||
"resources", strlen ("resources"),
|
||||
&obj_finder);
|
||||
|
||||
res = api->object_find_next (obj_finder, &resources_obj);
|
||||
api->object_find_destroy (obj_finder);
|
||||
if (res != 0) {
|
||||
log_printf (LOGSYS_LEVEL_INFO, "no resources.");
|
||||
return;
|
||||
}
|
||||
|
||||
/* this will be the system or process level
|
||||
*/
|
||||
api->object_find_create (
|
||||
resources_obj,
|
||||
NULL, 0,
|
||||
&obj_finder);
|
||||
while (api->object_find_next (obj_finder,
|
||||
&resource_type) == 0) {
|
||||
|
||||
api->object_find_create (
|
||||
resource_type,
|
||||
NULL, 0,
|
||||
&obj_finder2);
|
||||
|
||||
while (api->object_find_next (obj_finder2,
|
||||
&resource) == 0) {
|
||||
|
||||
wd_resource_create (resource);
|
||||
}
|
||||
api->object_find_destroy (obj_finder2);
|
||||
|
||||
api->object_track_start (resource_type, OBJECT_TRACK_DEPTH_ONE,
|
||||
NULL, wd_resource_object_created, NULL,
|
||||
NULL, NULL);
|
||||
}
|
||||
api->object_find_destroy (obj_finder);
|
||||
}
|
||||
|
||||
|
||||
static void watchdog_timeout_apply (uint32_t new)
|
||||
{
|
||||
struct watchdog_info ident;
|
||||
|
||||
if (new < 2) {
|
||||
watchdog_timeout = 2;
|
||||
}
|
||||
else if (new > 120) {
|
||||
watchdog_timeout = 120;
|
||||
}
|
||||
else {
|
||||
watchdog_timeout = new;
|
||||
}
|
||||
|
||||
if (dog > 0) {
|
||||
ioctl(dog, WDIOC_GETSUPPORT, &ident);
|
||||
if (ident.options & WDIOF_SETTIMEOUT) {
|
||||
/* yay! the dog is trained.
|
||||
*/
|
||||
ioctl(dog, WDIOC_SETTIMEOUT, &watchdog_timeout);
|
||||
}
|
||||
ioctl(dog, WDIOC_GETTIMEOUT, &watchdog_timeout);
|
||||
}
|
||||
tickle_timeout = watchdog_timeout / 2;
|
||||
|
||||
log_printf (LOGSYS_LEVEL_DEBUG, "The Watchdog timeout is %d seconds\n", watchdog_timeout);
|
||||
log_printf (LOGSYS_LEVEL_DEBUG, "The tickle timeout is %d seconds\n", tickle_timeout);
|
||||
}
|
||||
|
||||
static int setup_watchdog(void)
|
||||
{
|
||||
struct watchdog_info ident;
|
||||
|
||||
ENTER();
|
||||
if (access ("/dev/watchdog", W_OK) != 0) {
|
||||
log_printf (LOGSYS_LEVEL_WARNING, "No Watchdog, try modprobe <a watchdog>");
|
||||
dog = -1;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* here goes, lets hope they have "Magic Close"
|
||||
*/
|
||||
dog = open("/dev/watchdog", O_WRONLY);
|
||||
|
||||
if (dog == -1) {
|
||||
log_printf (LOGSYS_LEVEL_WARNING, "Watchdog exists but couldn't be opened.");
|
||||
dog = -1;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Right we have the dog.
|
||||
* Lets see what breed it is.
|
||||
*/
|
||||
|
||||
ioctl(dog, WDIOC_GETSUPPORT, &ident);
|
||||
log_printf (LOGSYS_LEVEL_INFO, "Watchdog is now been tickled by corosync.");
|
||||
log_printf (LOGSYS_LEVEL_DEBUG, "%s", ident.identity);
|
||||
|
||||
watchdog_timeout_apply (watchdog_timeout);
|
||||
|
||||
ioctl(dog, WDIOC_SETOPTIONS, WDIOS_ENABLECARD);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void wd_top_level_key_changed(object_change_type_t change_type,
|
||||
hdb_handle_t parent_object_handle,
|
||||
hdb_handle_t object_handle,
|
||||
const void *object_name_pt, size_t object_name_len,
|
||||
const void *key_name_pt, size_t key_len,
|
||||
const void *key_value_pt, size_t key_value_len,
|
||||
void *priv_data_pt)
|
||||
{
|
||||
uint32_t tmp_value;
|
||||
|
||||
ENTER();
|
||||
if (change_type != OBJECT_KEY_DELETED &&
|
||||
strncmp ((char*)key_name_pt, "watchdog_timeout", key_value_len) == 0) {
|
||||
tmp_value = strtol (key_value_pt, NULL, 0);
|
||||
watchdog_timeout_apply (tmp_value);
|
||||
}
|
||||
else {
|
||||
watchdog_timeout_apply (WD_DEFAULT_TIMEOUT);
|
||||
}
|
||||
log_printf (LOGSYS_LEVEL_INFO, "new(%d) tickle_timeout: %d", change_type, tickle_timeout);
|
||||
}
|
||||
|
||||
|
||||
static void watchdog_timeout_get_initial (void)
|
||||
{
|
||||
int32_t res;
|
||||
char watchdog_timeout_str[32];
|
||||
size_t watchdog_timeout_len;
|
||||
objdb_value_types_t watchdog_timeout_type;
|
||||
uint32_t tmp_value;
|
||||
|
||||
ENTER();
|
||||
|
||||
res = api->object_key_get_typed (resources_obj,
|
||||
"watchdog_timeout",
|
||||
(void**)&watchdog_timeout_str, &watchdog_timeout_len,
|
||||
&watchdog_timeout_type);
|
||||
if (res != 0) {
|
||||
watchdog_timeout_apply (WD_DEFAULT_TIMEOUT);
|
||||
|
||||
watchdog_timeout_len = snprintf (watchdog_timeout_str, 32, "%d", watchdog_timeout);
|
||||
api->object_key_create_typed (resources_obj,
|
||||
"watchdog_timeout", &watchdog_timeout_str,
|
||||
watchdog_timeout_len,
|
||||
OBJDB_VALUETYPE_STRING);
|
||||
}
|
||||
else {
|
||||
tmp_value = strtol (watchdog_timeout_str, NULL, 0);
|
||||
watchdog_timeout_apply (tmp_value);
|
||||
}
|
||||
|
||||
api->object_track_start (resources_obj, OBJECT_TRACK_DEPTH_ONE,
|
||||
wd_top_level_key_changed, NULL, NULL,
|
||||
NULL, NULL);
|
||||
|
||||
}
|
||||
|
||||
static int wd_exec_init_fn (
|
||||
struct corosync_api_v1 *corosync_api)
|
||||
{
|
||||
hdb_handle_t obj;
|
||||
|
||||
ENTER();
|
||||
#ifdef COROSYNC_SOLARIS
|
||||
logsys_subsys_init();
|
||||
#endif
|
||||
api = corosync_api;
|
||||
|
||||
object_find_or_create (OBJECT_PARENT_HANDLE,
|
||||
&resources_obj,
|
||||
"resources", strlen ("resources"));
|
||||
object_find_or_create (resources_obj,
|
||||
&obj,
|
||||
"system", strlen ("system"));
|
||||
object_find_or_create (resources_obj,
|
||||
&obj,
|
||||
"process", strlen ("process"));
|
||||
|
||||
watchdog_timeout_get_initial();
|
||||
|
||||
setup_watchdog();
|
||||
|
||||
wd_scan_resources();
|
||||
|
||||
api->timer_add_duration((unsigned long long)tickle_timeout*1000000000, NULL,
|
||||
wd_tickle_fn, &wd_timer);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int wd_exec_exit_fn (void)
|
||||
{
|
||||
char magic = 'V';
|
||||
ENTER();
|
||||
|
||||
if (dog > 0) {
|
||||
log_printf (LOGSYS_LEVEL_INFO, "magically closing the watchdog.");
|
||||
write (dog, &magic, 1);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user