mirror of
https://git.proxmox.com/git/mirror_corosync
synced 2025-07-27 03:13:34 +00:00
CTS: add a test case for Andrew's bug
Bug: shutdown can sometimes take a long time if all nodes are shut down together. git-svn-id: http://svn.fedorahosted.org/svn/corosync/trunk@2994 fd59a12c-fef9-0310-b244-a6a79926bd2f
This commit is contained in:
parent
b10fb56e8e
commit
c77d601667
@ -89,7 +89,7 @@ let common_logging =
|
||||
|kv "logfile_priority" /alert|crit|debug|emerg|err|info|notice|warning/
|
||||
|kv "syslog_priority" /alert|crit|debug|emerg|err|info|notice|warning/
|
||||
|kv "syslog_facility" /daemon|local0|local1|local2|local3|local4|local5|local6|local7/
|
||||
|qstr /logfile|tags/
|
||||
|qstr /logfile|trace/
|
||||
|
||||
(* A logger_subsys subsection *)
|
||||
let logger_subsys =
|
||||
|
@ -58,7 +58,7 @@ endif
|
||||
noinst_HEADERS = common_test_agent.h
|
||||
|
||||
cpg_test_agent_SOURCES = cpg_test_agent.c common_test_agent.c
|
||||
cpg_test_agent_LDADD = -lcpg -lcoroipcc ../../exec/coropoll.o ../../exec/crypto.o
|
||||
cpg_test_agent_LDADD = -lcpg -lcfg -lcoroipcc ../../exec/coropoll.o ../../exec/crypto.o
|
||||
cpg_test_agent_LDFLAGS = -L../../lib -L.
|
||||
|
||||
confdb_test_agent_SOURCES = confdb_test_agent.c common_test_agent.c
|
||||
|
@ -50,6 +50,7 @@
|
||||
#include <corosync/totem/coropoll.h>
|
||||
#include <corosync/list.h>
|
||||
#include <corosync/cpg.h>
|
||||
#include <corosync/cfg.h>
|
||||
#include "../../exec/crypto.h"
|
||||
#include "common_test_agent.h"
|
||||
|
||||
@ -82,7 +83,9 @@ static char big_and_buf[HOW_BIG_AND_BUF];
|
||||
static int32_t record_config_events_g = 0;
|
||||
static int32_t record_messages_g = 0;
|
||||
static cpg_handle_t cpg_handle = 0;
|
||||
static corosync_cfg_handle_t cfg_handle = 0;
|
||||
static int32_t cpg_fd = -1;
|
||||
static int32_t cfg_fd = -1;
|
||||
static struct list_head config_chg_log_head;
|
||||
static struct list_head msg_log_head;
|
||||
static pid_t my_pid;
|
||||
@ -91,7 +94,8 @@ static int32_t my_seq;
|
||||
static int32_t use_zcb = 0;
|
||||
static int32_t my_msgs_to_send;
|
||||
static int32_t total_stored_msgs = 0;
|
||||
|
||||
static int32_t in_cnchg = 0;
|
||||
static int32_t pcmk_test = 0;
|
||||
|
||||
static void send_some_more_messages (void * unused);
|
||||
|
||||
@ -171,6 +175,11 @@ static void delivery_callback (
|
||||
err_status_string (status_buf, 20, status));
|
||||
list_add_tail (&log_pt->list, &msg_log_head);
|
||||
total_stored_msgs++;
|
||||
|
||||
// if ((total_stored_msgs % 100) == 0) {
|
||||
// syslog (LOG_INFO, "%s(); %d", __func__, total_stored_msgs);
|
||||
// }
|
||||
|
||||
}
|
||||
|
||||
static void config_change_callback (
|
||||
@ -209,8 +218,27 @@ static void config_change_callback (
|
||||
list_add_tail (&log_pt->list, &config_chg_log_head);
|
||||
}
|
||||
}
|
||||
if (pcmk_test == 1) {
|
||||
in_cnchg = 1;
|
||||
send_some_more_messages (NULL);
|
||||
in_cnchg = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static void my_shutdown_callback (corosync_cfg_handle_t handle,
|
||||
corosync_cfg_shutdown_flags_t flags)
|
||||
{
|
||||
syslog (LOG_CRIT, "%s flags:%d", __func__, flags);
|
||||
if (flags & COROSYNC_CFG_SHUTDOWN_FLAG_REQUEST) {
|
||||
corosync_cfg_replyto_shutdown (cfg_handle, COROSYNC_CFG_SHUTDOWN_FLAG_YES);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static corosync_cfg_callbacks_t cfg_callbacks = {
|
||||
.corosync_cfg_shutdown_callback = my_shutdown_callback,
|
||||
.corosync_cfg_state_track_callback = NULL,
|
||||
};
|
||||
static cpg_callbacks_t callbacks = {
|
||||
.cpg_deliver_fn = delivery_callback,
|
||||
.cpg_confchg_fn = config_change_callback,
|
||||
@ -349,8 +377,10 @@ static void send_some_more_messages_zcb (void)
|
||||
if (res == CS_ERR_TRY_AGAIN) {
|
||||
/* lets do this later */
|
||||
send_some_more_messages_later ();
|
||||
syslog (LOG_INFO, "%s() cpg_mcast_joined() says try again.",
|
||||
__func__);
|
||||
// if (i > 0) {
|
||||
// syslog (LOG_INFO, "%s() TRY_AGAIN %d to send.",
|
||||
// __func__, my_msgs_to_send);
|
||||
// }
|
||||
goto free_buffer;
|
||||
} else if (res != CS_OK) {
|
||||
syslog (LOG_ERR, "%s() -> cpg_mcast_joined error:%d, exiting.",
|
||||
@ -365,6 +395,13 @@ free_buffer:
|
||||
}
|
||||
|
||||
|
||||
/*
 * Evaluate `code` and keep retrying it while it leaves CS_ERR_TRY_AGAIN in
 * the caller's `res` variable.
 *
 * counter - integer lvalue; incremented after every CS_ERR_TRY_AGAIN and
 *           then used as the number of seconds to sleep before retrying,
 *           so the delay grows by one second per attempt.
 * max     - retrying stops once counter is no longer below this value.
 * code    - statement or expression run on each attempt; it must assign
 *           its result to `res`.
 *
 * NOTE: depends on a variable named `res` being in scope at the call site.
 * The arguments are parenthesised so that expressions (for example a
 * conditional used as `max`) keep their intended meaning in the loop test.
 */
#define cs_repeat(counter, max, code) do {		\
	code;						\
	if (res == CS_ERR_TRY_AGAIN) {			\
		(counter)++;				\
		sleep (counter);			\
	}						\
} while (res == CS_ERR_TRY_AGAIN && (counter) < (max))
|
||||
|
||||
static unsigned char buffer[200000];
|
||||
static void send_some_more_messages_normal (void)
|
||||
@ -377,6 +414,8 @@ static void send_some_more_messages_normal (void)
|
||||
hash_state sha1_hash;
|
||||
cs_error_t res;
|
||||
cpg_flow_control_state_t fc_state;
|
||||
int retries = 0;
|
||||
time_t before;
|
||||
|
||||
if (cpg_fd < 0)
|
||||
return;
|
||||
@ -402,29 +441,43 @@ static void send_some_more_messages_normal (void)
|
||||
iov[1].iov_base = buffer;
|
||||
|
||||
for (i = 0; i < send_now; i++) {
|
||||
|
||||
res = cpg_flow_control_state_get (cpg_handle, &fc_state);
|
||||
if (res == CS_OK && fc_state == CPG_FLOW_CONTROL_ENABLED) {
|
||||
/* lets do this later */
|
||||
send_some_more_messages_later ();
|
||||
syslog (LOG_INFO, "%s() flow control enabled.", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
res = cpg_mcast_joined (cpg_handle, CPG_TYPE_AGREED, iov, 2);
|
||||
if (res == CS_ERR_TRY_AGAIN) {
|
||||
/* lets do this later */
|
||||
send_some_more_messages_later ();
|
||||
syslog (LOG_INFO, "%s() cpg_mcast_joined() says try again.",
|
||||
__func__);
|
||||
return;
|
||||
} else
|
||||
if (in_cnchg && pcmk_test) {
|
||||
retries = 0;
|
||||
before = time(NULL);
|
||||
cs_repeat(retries, 30, res = cpg_mcast_joined(cpg_handle, CPG_TYPE_AGREED, iov, 1));
|
||||
if (retries > 20) {
|
||||
syslog (LOG_ERR, "%s() -> cs_repeat: blocked for :%lu secs.",
|
||||
__func__, (unsigned long)(time(NULL) - before));
|
||||
}
|
||||
if (res != CS_OK) {
|
||||
syslog (LOG_ERR, "%s() -> cpg_mcast_joined error:%d.",
|
||||
__func__, res);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
res = cpg_flow_control_state_get (cpg_handle, &fc_state);
|
||||
if (res == CS_OK && fc_state == CPG_FLOW_CONTROL_ENABLED) {
|
||||
/* lets do this later */
|
||||
send_some_more_messages_later ();
|
||||
syslog (LOG_INFO, "%s() flow control enabled.", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
res = cpg_mcast_joined (cpg_handle, CPG_TYPE_AGREED, iov, 2);
|
||||
if (res == CS_ERR_TRY_AGAIN) {
|
||||
/* lets do this later */
|
||||
send_some_more_messages_later ();
|
||||
if (i > 0) {
|
||||
syslog (LOG_INFO, "%s() TRY_AGAIN %d to send.",
|
||||
__func__, my_msgs_to_send);
|
||||
}
|
||||
return;
|
||||
} else if (res != CS_OK) {
|
||||
syslog (LOG_ERR, "%s() -> cpg_mcast_joined error:%d, exiting.",
|
||||
__func__, res);
|
||||
exit (-2);
|
||||
}
|
||||
|
||||
}
|
||||
my_msgs_to_send--;
|
||||
}
|
||||
}
|
||||
@ -445,6 +498,7 @@ static void msg_blaster (int sock, char* num_to_send_str)
|
||||
my_pid = getpid();
|
||||
|
||||
use_zcb = 0;
|
||||
total_stored_msgs = 0;
|
||||
|
||||
cpg_local_get (cpg_handle, &my_nodeid);
|
||||
|
||||
@ -481,6 +535,7 @@ static void msg_blaster_zcb (int sock, char* num_to_send_str)
|
||||
my_pid = getpid();
|
||||
|
||||
use_zcb = 1;
|
||||
total_stored_msgs = 0;
|
||||
|
||||
cpg_local_get (cpg_handle, &my_nodeid);
|
||||
|
||||
@ -493,18 +548,52 @@ static void msg_blaster_zcb (int sock, char* num_to_send_str)
|
||||
send_some_more_messages_zcb ();
|
||||
}
|
||||
|
||||
static corosync_cfg_state_notification_t notification_buffer;
|
||||
|
||||
/*
 * Poll callback for the CFG file descriptor.
 *
 * If revents reports POLLHUP or POLLERR, or corosync_cfg_dispatch() returns
 * CS_ERR_LIBRARY, the problem is logged, cfg_fd is removed from the poll
 * loop, closed and reset to -1, and -1 is returned. In every other case the
 * function returns 0.
 *
 * The handle, fd and data arguments are not used; the file-scope cfg_handle
 * and cfg_fd are used instead.
 */
static int cfg_dispatch_wrapper_fn (hdb_handle_t handle,
	int fd,
	int revents,
	void *data)
{
	const int hangup_mask = POLLHUP | POLLERR;

	if ((revents & hangup_mask) != 0) {
		syslog (LOG_ERR, "%s() got POLLHUP disconnecting from CFG", __func__);
		goto disconnect;
	}

	if (corosync_cfg_dispatch (cfg_handle, CS_DISPATCH_ALL) == CS_ERR_LIBRARY) {
		syslog (LOG_ERR, "%s() got LIB error disconnecting from CFG.", __func__);
		goto disconnect;
	}

	return 0;

disconnect:
	/* Single teardown path shared by both failure cases. */
	poll_dispatch_delete (ta_poll_handle_get(), cfg_fd);
	close (cfg_fd);
	cfg_fd = -1;
	return -1;
}
|
||||
|
||||
/*
 * Poll callback for the CPG file descriptor.
 *
 * The revents mask is examined before anything is dispatched: on POLLHUP or
 * POLLERR the descriptor is removed from the poll loop, closed, and cpg_fd
 * is reset to -1. The same teardown happens when cpg_dispatch() returns
 * CS_ERR_LIBRARY. Both failure paths return -1; otherwise 0 is returned.
 *
 * The handle, fd and data arguments are not used; the file-scope cpg_handle
 * and cpg_fd are used instead.
 */
static int cpg_dispatch_wrapper_fn (hdb_handle_t handle,
	int fd,
	int revents,
	void *data)
{
	cs_error_t error;

	if (revents & POLLHUP || revents & POLLERR) {
		syslog (LOG_ERR, "%s() got POLLHUP disconnecting from CPG", __func__);
		poll_dispatch_delete (ta_poll_handle_get(), cpg_fd);
		close (cpg_fd);
		cpg_fd = -1;
		return -1;
	}

	/* Dispatch exactly once, and only after the descriptor is known good. */
	error = cpg_dispatch (cpg_handle, CS_DISPATCH_ALL);
	if (error == CS_ERR_LIBRARY) {
		syslog (LOG_ERR, "%s() got LIB error disconnecting from CPG", __func__);
		poll_dispatch_delete (ta_poll_handle_get(), cpg_fd);
		close (cpg_fd);
		cpg_fd = -1;
		return -1;
	}

	return 0;
}
|
||||
@ -595,6 +684,8 @@ static void do_command (int sock, char* func, char*args[], int num_args)
|
||||
read_messages (sock, args[0]);
|
||||
} else if (strcmp ("msg_blaster_zcb", func) == 0) {
|
||||
msg_blaster_zcb (sock, args[0]);
|
||||
} else if (strcmp ("pcmk_test", func) == 0) {
|
||||
pcmk_test = 1;
|
||||
} else if (strcmp ("msg_blaster", func) == 0) {
|
||||
msg_blaster (sock, args[0]);
|
||||
} else if (strcmp ("context_test", func) == 0) {
|
||||
@ -602,6 +693,33 @@ static void do_command (int sock, char* func, char*args[], int num_args)
|
||||
} else if (strcmp ("are_you_ok_dude", func) == 0) {
|
||||
snprintf (response, 100, "%s", OK_STR);
|
||||
send (sock, response, strlen (response), 0);
|
||||
|
||||
} else if (strcmp ("cfg_shutdown", func) == 0) {
|
||||
|
||||
corosync_cfg_try_shutdown (cfg_handle, COROSYNC_CFG_SHUTDOWN_FLAG_REQUEST);
|
||||
|
||||
} else if (strcmp ("cfg_initialize",func) == 0) {
|
||||
int retry_count = 0;
|
||||
|
||||
syslog (LOG_INFO,"%s %s() called!", __func__, func);
|
||||
result = corosync_cfg_initialize (&cfg_handle, &cfg_callbacks);
|
||||
while (result != CS_OK) {
|
||||
syslog (LOG_ERR,
|
||||
"cfg_initialize error %d (attempt %d)\n",
|
||||
result, retry_count);
|
||||
if (retry_count >= 3) {
|
||||
exit (1);
|
||||
}
|
||||
sleep(1);
|
||||
retry_count++;
|
||||
result = corosync_cfg_initialize (&cfg_handle, &cfg_callbacks);
|
||||
}
|
||||
|
||||
corosync_cfg_fd_get (cfg_handle, &cfg_fd);
|
||||
|
||||
corosync_cfg_state_track (cfg_handle, 0, &notification_buffer);
|
||||
|
||||
poll_dispatch_add (ta_poll_handle_get(), cfg_fd, POLLIN|POLLNVAL, NULL, cfg_dispatch_wrapper_fn);
|
||||
} else {
|
||||
syslog (LOG_ERR,"%s RPC:%s not supported!", __func__, func);
|
||||
}
|
||||
|
@ -79,6 +79,7 @@ class CoroTest(CTSTest):
|
||||
if self.need_all_up and self.CM.start_cpg:
|
||||
self.CM.cpg_agent[n].clean_start()
|
||||
self.CM.cpg_agent[n].cpg_join(self.name)
|
||||
self.CM.cpg_agent[n].cfg_initialize()
|
||||
if not self.need_all_up and self.CM.StataCM(n):
|
||||
self.incr("stopped")
|
||||
self.stop(n)
|
||||
@ -776,8 +777,6 @@ class QuorumState(object):
|
||||
|
||||
###################################################################
|
||||
class VoteQuorumBase(CoroTest):
|
||||
'''
|
||||
'''
|
||||
|
||||
def setup(self, node):
|
||||
ret = CoroTest.setup(self, node)
|
||||
@ -796,8 +795,9 @@ class VoteQuorumBase(CoroTest):
|
||||
def config_valid(self, config):
    '''Return False when the configuration sets 'totem/rrp_mode' or
    'quorum/provider'; return True for every other configuration.

    config -- object queried through has_key() for the two keys above.
    '''
    # Each excluded key gets its own guard. An 'else: return True' after the
    # first check would make the quorum/provider check unreachable.
    if config.has_key('totem/rrp_mode'):
        return False
    if config.has_key('quorum/provider'):
        return False
    return True
|
||||
|
||||
|
||||
###################################################################
|
||||
@ -849,7 +849,7 @@ class VoteQuorumGoDown(VoteQuorumBase):
|
||||
self.failure('unexpected number of expected_votes')
|
||||
|
||||
if state.total_votes != nodes_alive:
|
||||
self.failure('unexpected number of total votes')
|
||||
self.failure('unexpected number of total votes:%d, nodes_alive:%d', (state.total_votes, nodes_alive))
|
||||
|
||||
min = ((len(self.CM.Env["nodes"]) + 2) / 2)
|
||||
if min != state.quorum:
|
||||
@ -1019,9 +1019,58 @@ class GenSimulStop(CoroTest):
|
||||
return self.success()
|
||||
|
||||
|
||||
###################################################################
|
||||
class GenStopAllBeekhof(CoroTest):
    '''Stop all the nodes ~ simultaneously'''

    def __init__(self, cm):
        CoroTest.__init__(self,cm)
        self.name="GenStopAllBeekhof"
        self.need_all_up = True

    def __call__(self, node):
        '''Perform the 'GenStopAllBeekhof' test.

        Every node's cpg agent is switched into pcmk_test mode, told to
        blast 10000 messages and then asked to shut corosync down through
        CFG. The nodes are then polled every 3 seconds until all of them
        report down or more than max_wait seconds have elapsed. If the
        wait exceeded max_wait, anything still up is killed and the test
        fails; otherwise it succeeds.
        '''
        self.incr("calls")

        stopping = int(time.time())
        for n in self.CM.Env["nodes"]:
            self.CM.cpg_agent[n].pcmk_test()
            self.CM.cpg_agent[n].msg_blaster(10000)
            self.CM.cpg_agent[n].cfg_shutdown()
            self.CM.ShouldBeStatus[n] = "down"

        waited = 0
        max_wait = 60

        still_up = list(self.CM.Env["nodes"])
        while len(still_up) > 0:
            waited = int(time.time()) - stopping
            self.CM.log("%s still up %s; waited %d secs" % (self.name, str(still_up), waited))
            if waited > max_wait:
                break
            time.sleep(3)
            # Walk a copy so that nodes can be removed from still_up while
            # iterating.
            for v in list(still_up):
                # Re-assert the expected state of the node being polled.
                # This must be indexed by v: the earlier loop's n is stale
                # here and would only ever touch the last node.
                # NOTE(review): presumably needed because StataCM may
                # update ShouldBeStatus -- confirm.
                self.CM.ShouldBeStatus[v] = "down"
                if not self.CM.StataCM(v):
                    still_up.remove(v)

        waited = int(time.time()) - stopping
        if waited > max_wait:
            for v in still_up:
                self.CM.log("%s killing corosync on %s" % (self.name, v))
                self.CM.rsh(v, 'killall -SIGSEGV corosync cpg_test_agent')
            return self.failure("Waited %d secs for nodes: %s to stop" % (waited, str(still_up)))

        self.CM.log("%s ALL good (waited %d secs)" % (self.name, waited))
        return self.success()
|
||||
|
||||
|
||||
|
||||
GenTestClasses = []
|
||||
GenTestClasses.append(GenSimulStart)
|
||||
GenTestClasses.append(GenSimulStop)
|
||||
GenTestClasses.append(GenStopAllBeekhof)
|
||||
GenTestClasses.append(CpgMsgOrderBasic)
|
||||
GenTestClasses.append(CpgMsgOrderZcb)
|
||||
GenTestClasses.append(CpgCfgChgOnExecCrash)
|
||||
@ -1083,19 +1132,38 @@ def CoroTestList(cm, audits):
|
||||
a = ConfigContainer('none_5min')
|
||||
a['compatibility'] = 'none'
|
||||
a['totem/token'] = (5 * 60 * 1000)
|
||||
a['totem/consensus'] = int(5 * 60 * 1000 * 1.2) + 1
|
||||
configs.append(a)
|
||||
|
||||
b = ConfigContainer('whitetank_5min')
|
||||
b = ConfigContainer('pcmk_basic')
|
||||
b['compatibility'] = 'whitetank'
|
||||
b['totem/token'] = (5 * 60 * 1000)
|
||||
b['totem/token'] = 5000
|
||||
b['totem/token_retransmits_before_loss_const'] = 10
|
||||
b['totem/join'] = 1000
|
||||
b['totem/consensus'] = 7500
|
||||
configs.append(b)
|
||||
|
||||
c = ConfigContainer('sec_nss')
|
||||
c = ConfigContainer('pcmk_sec_nss')
|
||||
c['totem/secauth'] = 'on'
|
||||
c['totem/crypto_accept'] = 'new'
|
||||
c['totem/crypto_type'] = 'nss'
|
||||
c['totem/token'] = 5000
|
||||
c['totem/token_retransmits_before_loss_const'] = 10
|
||||
c['totem/join'] = 1000
|
||||
c['totem/consensus'] = 7500
|
||||
configs.append(c)
|
||||
|
||||
s = ConfigContainer('pcmk_vq')
|
||||
s['quorum/provider'] = 'corosync_votequorum'
|
||||
s['quorum/expected_votes'] = len(cm.Env["nodes"])
|
||||
s['totem/token'] = 5000
|
||||
s['totem/token_retransmits_before_loss_const'] = 10
|
||||
s['totem/join'] = 1000
|
||||
s['totem/vsftype'] = 'none'
|
||||
s['totem/consensus'] = 7500
|
||||
s['totem/max_messages'] = 20
|
||||
configs.append(s)
|
||||
|
||||
d = ConfigContainer('sec_sober')
|
||||
d['totem/secauth'] = 'on'
|
||||
d['totem/crypto_type'] = 'sober'
|
||||
@ -1105,11 +1173,6 @@ def CoroTestList(cm, audits):
|
||||
e['totem/threads'] = 4
|
||||
configs.append(e)
|
||||
|
||||
#quorum/provider=
|
||||
#f = {}
|
||||
#f['quorum/provider'] = 'corosync_quorum_ykd'
|
||||
#configs.append(f)
|
||||
|
||||
if not cm.Env["RrpBindAddr"] is None:
|
||||
g = ConfigContainer('rrp_passive')
|
||||
g['totem/rrp_mode'] = 'passive'
|
||||
|
Loading…
Reference in New Issue
Block a user