From d99fba72e65545d8a3573b754525bd2ec8dcc540 Mon Sep 17 00:00:00 2001 From: Steven Dake Date: Fri, 18 Mar 2011 18:47:10 -0700 Subject: [PATCH] Resolve abort during simultaneous stopping of at least 4 nodes Consider 5 nodes. Nodes 3 and 4 are stopped (by random stopping); nodes 1, 2, and 5 form a new configuration, and during recovery nodes 1 and 2 are stopped (via service corosync stop). This causes node 5 never to finish recovery within the timeout period, triggering a token loss in recovery. Bug #623176 resolved an assert which happens because the full ring id was being restored. The resolution to Bug #623176 was to not restore the full ring id, and instead operate (according to specifications) on the new ring id. Unfortunately, this exposes a problem whereby the restarting of nodes 1-4 generates the same ring id. This ring id gets to the recovery-failed node 5, which is now in gather, and triggers a condition not accounted for in the original Totem specification. It appears later work from Dr. Agarwal's PhD dissertation considers this scenario. That solution entails rejecting the regular token in the above condition. Since the ring id is also used to make decisions for commit token acceptance, we must also take care to reject the regular token in all cases after transitioning from OPERATIONAL. 
Signed-off-by: Steven Dake Reviewed-by: Steven Dake --- exec/totemsrp.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/exec/totemsrp.c b/exec/totemsrp.c index e29456ca..6ea5bc02 100644 --- a/exec/totemsrp.c +++ b/exec/totemsrp.c @@ -501,6 +501,8 @@ struct totemsrp_instance { totemsrp_stats_t stats; + uint32_t orf_token_discard; + void * token_recv_event_handle; void * token_sent_event_handle; char commit_token_storage[9000]; @@ -665,6 +667,8 @@ static void totemsrp_instance_initialize (struct totemsrp_instance *instance) instance->my_high_delivered = SEQNO_START_MSG; + instance->orf_token_discard = 0; + instance->commit_token = (struct memb_commit_token *)instance->commit_token_storage; } @@ -1571,6 +1575,7 @@ static void timer_function_orf_token_timeout (void *data) log_printf (instance->totemsrp_log_level_debug, "The token was lost in the RECOVERY state.\n"); memb_recovery_state_token_loss (instance); + instance->orf_token_discard = 1; break; } } @@ -1823,6 +1828,8 @@ static void memb_state_gather_enter ( struct totemsrp_instance *instance, int gather_from) { + instance->orf_token_discard = 1; + memb_set_merge ( &instance->my_id, 1, instance->my_proc_list, &instance->my_proc_list_entries); @@ -1953,6 +1960,8 @@ static void memb_state_recovery_enter ( log_printf (instance->totemsrp_log_level_debug, "entering RECOVERY state.\n"); + instance->orf_token_discard = 0; + instance->my_high_ring_delivered = 0; sq_reinit (&instance->recovery_sort_queue, SEQNO_START_MSG); @@ -3365,6 +3374,9 @@ static int message_handler_orf_token ( "Time since last token %0.4f ms\n", ((float)tv_diff) / 1000000.0); #endif + if (instance->orf_token_discard) { + return (0); + } #ifdef TEST_DROP_ORF_TOKEN_PERCENTAGE if (random()%100 < TEST_DROP_ORF_TOKEN_PERCENTAGE) { return (0);