totemsrp: Improve logging of left/down nodes

This patch from Hideo Yamauchi improves the logging of
whether nodes leave the cluster cleanly or uncleanly,
making it easier to determine if a node was shut down
by the operator. A LEAVE message can also be missed
(because the local node is in the flush state), so the
patch makes that case clearer as well.

The modifications are as follows:

Change 1) Add a list to totemsrp that records the nodes which sent a LEAVE message.
Change 2) Add routines to register, search, and clear entries in that list (see the sketch after this list).
Change 3) Add log output naming the nodes that left without a LEAVE message.
Change 4) Change the output level of an existing log message.
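
Below is a minimal standalone sketch of the list handling described in
Changes 1-3. It is an illustration, not corosync's code: the
PROCESSOR_COUNT_MAX value, the leave_tracker struct, and the driver in
main() are simplified stand-ins for the real totemsrp definitions.

#include <stdio.h>
#include <string.h>

#define PROCESSOR_COUNT_MAX 384   /* assumed; the real value lives in totem.h */

struct leave_tracker {
	unsigned int list[PROCESSOR_COUNT_MAX];
	int entries;
};

/* Clear the list when a new configuration forms (my_leave_memb_clear). */
static void leave_clear(struct leave_tracker *t)
{
	memset(t->list, 0, sizeof(t->list));
	t->entries = 0;
}

/* Return the nodeid if it announced a clean leave, else 0 (my_leave_memb_match). */
static unsigned int leave_match(const struct leave_tracker *t, unsigned int nodeid)
{
	for (int i = 0; i < t->entries; i++) {
		if (t->list[i] == nodeid)
			return nodeid;
	}
	return 0;
}

/* Record a nodeid at most once, refusing to overflow (my_leave_memb_set). */
static void leave_set(struct leave_tracker *t, unsigned int nodeid)
{
	if (leave_match(t, nodeid))
		return;
	if (t->entries < PROCESSOR_COUNT_MAX - 1)
		t->list[t->entries++] = nodeid;
	else
		fprintf(stderr, "Cannot set LEAVE nodeid=%u\n", nodeid);
}

int main(void)
{
	unsigned int left[] = { 1, 2, 3 };   /* nodes missing from the new ring */
	struct leave_tracker t;

	leave_clear(&t);
	leave_set(&t, 2);   /* only node 2 sent a LEAVE before going away */

	/* Change 3: members of left[] without a LEAVE entry are reported failed. */
	for (size_t i = 0; i < sizeof(left) / sizeof(left[0]); i++) {
		if (leave_match(&t, left[i]) == 0)
			printf("failed: %u\n", left[i]);
		else
			printf("left cleanly: %u\n", left[i]);
	}
	return 0;
}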

Signed-off-by: Hideo Yamauchi <renayama19661014@ybb.ne.jp>
Reviewed-by: Christine Caulfield <ccaulfie@redhat.com>
Reviewed-by: Jan Friesse <jfriesse@redhat.com>
committer Christine Caulfield 2015-06-12 16:16:45 +01:00
parent 53f67a2a79
commit ab8942f626


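For orientation before the diff: a clean LEAVE is encoded as a MEMB_JOIN
message whose header nodeid is the LEAVE_DUMMY_NODEID sentinel and whose
failed list ends with the leaving node; the new memb_join_process logic
further down keys on exactly that. The following sketch models the
dispatch under those assumptions; the message struct, the sentinel value
of 0, and the classify_join/record_leave names are hypothetical
simplifications, not corosync's real definitions.

#include <stdio.h>

#define LEAVE_DUMMY_NODEID 0   /* assumed sentinel; check totemsrp.c for the real value */

struct memb_join_msg {
	unsigned int header_nodeid;     /* LEAVE_DUMMY_NODEID marks a LEAVE message */
	unsigned int failed_list[8];    /* the leaving node is the last entry */
	int failed_list_entries;
};

/* Returns 1 if the message was discarded (flush in progress), 0 otherwise.
 * record_leave stands in for my_leave_memb_set in the patch below. */
static int classify_join(const struct memb_join_msg *m, int flushing,
			 void (*record_leave)(unsigned int))
{
	unsigned int leaver = m->failed_list_entries > 0 ?
		m->failed_list[m->failed_list_entries - 1] : LEAVE_DUMMY_NODEID;

	if (flushing) {
		if (m->header_nodeid == LEAVE_DUMMY_NODEID) {
			/* The message is dropped, but the LEAVE is still
			 * remembered so the node is not reported as failed. */
			printf("Discarding LEAVE message during flush, nodeid=%u\n", leaver);
			if (m->failed_list_entries > 0)
				record_leave(leaver);
		} else {
			printf("Discarding JOIN message during flush, nodeid=%u\n",
			       m->header_nodeid);
		}
		return 1;
	}
	if (m->header_nodeid == LEAVE_DUMMY_NODEID && m->failed_list_entries > 0)
		record_leave(leaver);   /* clean leave on the normal path */
	return 0;
}

static void note(unsigned int nodeid)
{
	printf("LEAVE recorded for node %u\n", nodeid);
}

int main(void)
{
	struct memb_join_msg leave = { LEAVE_DUMMY_NODEID, { 5 }, 1 };

	classify_join(&leave, 0, note);  /* normal path: LEAVE is recorded */
	classify_join(&leave, 1, note);  /* flush path: dropped but still recorded */
	return 0;
}
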
@@ -316,6 +316,8 @@ struct totemsrp_instance {
struct srp_addr my_left_memb_list[PROCESSOR_COUNT_MAX];
unsigned int my_leave_memb_list[PROCESSOR_COUNT_MAX];
int my_proc_list_entries;
int my_failed_list_entries;
@@ -329,6 +331,8 @@ struct totemsrp_instance {
int my_deliver_memb_entries;
int my_left_memb_entries;
int my_leave_memb_entries;
struct memb_ring_id my_ring_id;
@@ -513,6 +517,8 @@ struct totemsrp_instance {
uint32_t threaded_mode_enabled;
uint32_t waiting_trans_ack;
int flushing;
void * token_recv_event_handle;
void * token_sent_event_handle;
@@ -1476,6 +1482,52 @@ static void memb_set_print (
}
}
#endif
static void my_leave_memb_clear(
struct totemsrp_instance *instance)
{
memset(instance->my_leave_memb_list, 0, sizeof(instance->my_leave_memb_list));
instance->my_leave_memb_entries = 0;
}
static unsigned int my_leave_memb_match(
struct totemsrp_instance *instance,
unsigned int nodeid)
{
int i;
unsigned int ret = 0;
for (i = 0; i < instance->my_leave_memb_entries; i++){
if (instance->my_leave_memb_list[i] == nodeid){
ret = nodeid;
break;
}
}
return ret;
}
static void my_leave_memb_set(
struct totemsrp_instance *instance,
unsigned int nodeid)
{
int i, found = 0;
for (i = 0; i < instance->my_leave_memb_entries; i++){
if (instance->my_leave_memb_list[i] == nodeid){
found = 1;
break;
}
}
if (found == 1) {
return;
}
if (instance->my_leave_memb_entries < (PROCESSOR_COUNT_MAX - 1)) {
instance->my_leave_memb_list[instance->my_leave_memb_entries] = nodeid;
instance->my_leave_memb_entries++;
} else {
log_printf (instance->totemsrp_log_level_warning,
"Cannot set LEAVE nodeid=%d", nodeid);
}
}
static void *totemsrp_buffer_alloc (struct totemsrp_instance *instance)
{
@@ -1837,6 +1889,7 @@ static void memb_state_operational_enter (struct totemsrp_instance *instance)
unsigned int res;
char left_node_msg[1024];
char joined_node_msg[1024];
char failed_node_msg[1024];
instance->originated_orf_token = 0;
@@ -2008,15 +2061,30 @@ static void memb_state_operational_enter (struct totemsrp_instance *instance)
if (instance->my_left_memb_entries) {
int sptr = 0;
int sptr2 = 0;
sptr += snprintf(left_node_msg, sizeof(left_node_msg)-sptr, " left:");
for (i=0; i< instance->my_left_memb_entries; i++) {
sptr += snprintf(left_node_msg+sptr, sizeof(left_node_msg)-sptr, " %u", left_list[i]);
}
for (i=0; i< instance->my_left_memb_entries; i++) {
if (my_leave_memb_match(instance, left_list[i]) == 0) {
if (sptr2 == 0) {
sptr2 += snprintf(failed_node_msg, sizeof(failed_node_msg)-sptr2, " failed:");
}
sptr2 += snprintf(failed_node_msg+sptr2, sizeof(failed_node_msg)-sptr2, " %u", left_list[i]);
}
}
if (sptr2 == 0) {
failed_node_msg[0] = '\0';
}
}
else {
left_node_msg[0] = '\0';
failed_node_msg[0] = '\0';
}
my_leave_memb_clear(instance);
log_printf (instance->totemsrp_log_level_debug,
"entering OPERATIONAL state.");
log_printf (instance->totemsrp_log_level_notice,
@@ -2025,6 +2093,13 @@ static void memb_state_operational_enter (struct totemsrp_instance *instance)
instance->my_ring_id.seq,
joined_node_msg,
left_node_msg);
if (strlen(failed_node_msg)) {
log_printf (instance->totemsrp_log_level_notice,
"Failed to receive the leave message.%s",
failed_node_msg);
}
instance->memb_state = MEMB_STATE_OPERATIONAL;
instance->stats.operational_entered++;
@@ -3597,8 +3672,9 @@ static int message_handler_orf_token (
return (0);
}
#endif
instance->flushing = 1;
totemrrp_recv_flush (instance->totemrrp_context);
instance->flushing = 0;
/*
* Determine if we should hold (in reality drop) the token
@@ -4130,6 +4206,32 @@ static void memb_join_process (
memb_set_print ("my_faillist", instance->my_failed_list, instance->my_failed_list_entries);
-*/
if (memb_join->header.type == MESSAGE_TYPE_MEMB_JOIN) {
if (instance->flushing) {
if (memb_join->header.nodeid == LEAVE_DUMMY_NODEID) {
log_printf (instance->totemsrp_log_level_warning,
"Discarding LEAVE message during flush, nodeid=%u",
memb_join->failed_list_entries > 0 ? failed_list[memb_join->failed_list_entries - 1 ].addr[0].nodeid : LEAVE_DUMMY_NODEID);
if (memb_join->failed_list_entries > 0) {
my_leave_memb_set(instance, failed_list[memb_join->failed_list_entries - 1 ].addr[0].nodeid);
}
} else {
log_printf (instance->totemsrp_log_level_warning,
"Discarding JOIN message during flush, nodeid=%d", memb_join->header.nodeid);
}
return;
} else {
if (memb_join->header.nodeid == LEAVE_DUMMY_NODEID) {
log_printf (instance->totemsrp_log_level_debug,
"Recieve LEAVE message from %u", memb_join->failed_list_entries > 0 ? failed_list[memb_join->failed_list_entries - 1 ].addr[0].nodeid : LEAVE_DUMMY_NODEID);
if (memb_join->failed_list_entries > 0) {
my_leave_memb_set(instance, failed_list[memb_join->failed_list_entries - 1 ].addr[0].nodeid);
}
}
}
}
if (memb_set_equal (proc_list,
memb_join->proc_list_entries,
instance->my_proc_list,
@@ -4573,6 +4675,7 @@ void main_deliver_fn (
return;
}
switch (message_header->type) {
case MESSAGE_TYPE_ORF_TOKEN:
instance->stats.orf_token_rx++;