From a8b7513df9050777a31dd4ebc7b82c82f60d2721 Mon Sep 17 00:00:00 2001 From: Jan Friesse Date: Wed, 18 Nov 2020 14:31:01 +0100 Subject: [PATCH] qnetd: Improve dead peer detection Previously dead peer detection timer was scheduled every dpd_interval, added dpd_interval to all of the clients timestamp and if timestamp was larger than client hearbeat interval * 1.2 then check if client sent some message. If so, flag was reset. This method was source of number of problems so instead different method is now used. Every single client has its own timer with timeout based on (configurable) dpd_interval_coefficient and multiplied with client heartbeat timeout. When message is received from client timer is rescheduled. When timer callback is called (= client doesn't sent message during timeout) then client is disconnected. Signed-off-by: Jan Friesse --- man/corosync-qnetd.8 | 7 +- qdevices/Makefile.am | 2 +- qdevices/corosync-qnetd.c | 3 + qdevices/qnet-config.h | 5 +- qdevices/qnetd-advanced-settings.c | 11 ++- qdevices/qnetd-advanced-settings.h | 2 +- ...d-dpd-timer.c => qnetd-client-dpd-timer.c} | 83 +++++++++++-------- ...d-dpd-timer.h => qnetd-client-dpd-timer.h} | 20 +++-- qdevices/qnetd-client-msg-received.c | 16 +++- qdevices/qnetd-client-net.c | 21 ++++- qdevices/qnetd-client.c | 4 + qdevices/qnetd-client.h | 3 +- qdevices/qnetd-instance.c | 10 +-- qdevices/qnetd-instance.h | 1 - 14 files changed, 127 insertions(+), 61 deletions(-) rename qdevices/{qnetd-dpd-timer.c => qnetd-client-dpd-timer.c} (54%) rename qdevices/{qnetd-dpd-timer.h => qnetd-client-dpd-timer.h} (74%) diff --git a/man/corosync-qnetd.8 b/man/corosync-qnetd.8 index 7a108e8..790ee9d 100644 --- a/man/corosync-qnetd.8 +++ b/man/corosync-qnetd.8 @@ -31,7 +31,7 @@ .\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF .\" * THE POSSIBILITY OF SUCH DAMAGE. .\" */ -.TH COROSYNC-QNETD 8 2020-09-22 +.TH COROSYNC-QNETD 8 2020-11-18 .SH NAME corosync-qnetd \- QNet daemon .SH SYNOPSIS @@ -216,8 +216,9 @@ Maximum heartbeat timeout accepted by server in ms. (120000) .B dpd_enabled Dead peer detection enabled. (on) .TP -.B dpd_interval -How often the DPD algorithm detects dead peers in ms. (1000) +.B dpd_interval_coefficient +Value is multiplied with heartbeat interval sent by qdevice client and used as a timeout +for dead peer detection. (1.5) .TP .B lock_file Lock file location. (/var/run/corosync-qnetd/corosync-qnetd.pid) diff --git a/qdevices/Makefile.am b/qdevices/Makefile.am index 96e8508..de9d097 100644 --- a/qdevices/Makefile.am +++ b/qdevices/Makefile.am @@ -64,7 +64,7 @@ corosync_qnetd_SOURCES = corosync-qnetd.c \ qnetd-client-msg-received.c qnetd-client-msg-received.h \ qnetd-log-debug.c qnetd-log-debug.h \ qnetd-client-algo-timer.c qnetd-client-algo-timer.h \ - qnetd-dpd-timer.c qnetd-dpd-timer.h \ + qnetd-client-dpd-timer.c qnetd-client-dpd-timer.h \ qnetd-ipc.c qnetd-ipc.h unix-socket-ipc.c unix-socket-ipc.h \ dynar-simple-lex.c dynar-simple-lex.h dynar-str.c dynar-str.h \ unix-socket-client.c unix-socket-client.h \ diff --git a/qdevices/corosync-qnetd.c b/qdevices/corosync-qnetd.c index eede5d1..0d7f764 100644 --- a/qdevices/corosync-qnetd.c +++ b/qdevices/corosync-qnetd.c @@ -237,6 +237,9 @@ cli_parse_long_opt(struct qnetd_advanced_settings *advanced_settings, const char case -2: errx(EXIT_FAILURE, "Invalid value '%s' for option '%s'", val, opt); break; + case -3: + warnx("Option '%s' is deprecated and has no effect anymore", opt); + break; } } diff --git a/qdevices/qnet-config.h b/qdevices/qnet-config.h index 2b070de..544c0c4 100644 --- a/qdevices/qnet-config.h +++ b/qdevices/qnet-config.h @@ -71,8 +71,9 @@ extern "C" { #define QNETD_MIN_HEARTBEAT_INTERVAL 1 #define QNETD_DEFAULT_DPD_ENABLED 1 -#define QNETD_DEFAULT_DPD_INTERVAL (1*1000) -#define QNETD_MIN_DPD_INTERVAL 1 +#define QNETD_DEFAULT_DPD_INTERVAL_COEFFICIENT 1.5 +#define QNETD_MIN_DPD_INTERVAL_COEFFICIENT 1 +#define QNETD_MAX_DPD_INTERVAL_COEFFICIENT 1000 #define QNETD_DEFAULT_LOCK_FILE LOCALSTATEDIR "/run/corosync-qnetd/corosync-qnetd.pid" #define QNETD_DEFAULT_LOCAL_SOCKET_FILE LOCALSTATEDIR "/run/corosync-qnetd/corosync-qnetd.sock" diff --git a/qdevices/qnetd-advanced-settings.c b/qdevices/qnetd-advanced-settings.c index cd5c5f6..78241c8 100644 --- a/qdevices/qnetd-advanced-settings.c +++ b/qdevices/qnetd-advanced-settings.c @@ -63,7 +63,7 @@ qnetd_advanced_settings_init(struct qnetd_advanced_settings *settings) settings->heartbeat_interval_min = QNETD_DEFAULT_HEARTBEAT_INTERVAL_MIN; settings->heartbeat_interval_max = QNETD_DEFAULT_HEARTBEAT_INTERVAL_MAX; settings->dpd_enabled = QNETD_DEFAULT_DPD_ENABLED; - settings->dpd_interval = QNETD_DEFAULT_DPD_INTERVAL; + settings->dpd_interval_coefficient = QNETD_DEFAULT_DPD_INTERVAL_COEFFICIENT; if ((settings->lock_file = strdup(QNETD_DEFAULT_LOCK_FILE)) == NULL) { return (-1); } @@ -94,12 +94,14 @@ qnetd_advanced_settings_destroy(struct qnetd_advanced_settings *settings) * 0 - No error * -1 - Unknown option * -2 - Incorrect value + * -3 - Deprecated value */ int qnetd_advanced_settings_set(struct qnetd_advanced_settings *settings, const char *option, const char *value) { long long int tmpll; + double tmpdbl; if (strcasecmp(option, "listen_backlog") == 0) { if (utils_strtonum(value, QNETD_MIN_LISTEN_BACKLOG, INT_MAX, &tmpll) == -1) { @@ -156,11 +158,14 @@ qnetd_advanced_settings_set(struct qnetd_advanced_settings *settings, settings->dpd_enabled = (uint8_t)tmpll; } else if (strcasecmp(option, "dpd_interval") == 0) { - if (utils_strtonum(value, QNETD_MIN_DPD_INTERVAL, UINT32_MAX, &tmpll) == -1) { + return (-3); + } else if (strcasecmp(option, "dpd_interval_coefficient") == 0) { + if (utils_strtod(value, QNETD_MIN_DPD_INTERVAL_COEFFICIENT, + QNETD_MAX_DPD_INTERVAL_COEFFICIENT, &tmpdbl) == -1) { return (-2); } - settings->dpd_interval = (uint32_t)tmpll; + settings->dpd_interval_coefficient = tmpdbl; } else if (strcasecmp(option, "lock_file") == 0) { free(settings->lock_file); diff --git a/qdevices/qnetd-advanced-settings.h b/qdevices/qnetd-advanced-settings.h index 740d2b7..bbe9acf 100644 --- a/qdevices/qnetd-advanced-settings.h +++ b/qdevices/qnetd-advanced-settings.h @@ -49,7 +49,6 @@ struct qnetd_advanced_settings { uint32_t heartbeat_interval_min; uint32_t heartbeat_interval_max; uint8_t dpd_enabled; - uint32_t dpd_interval; char *lock_file; char *local_socket_file; int local_socket_backlog; @@ -57,6 +56,7 @@ struct qnetd_advanced_settings { size_t ipc_max_send_size; size_t ipc_max_receive_size; enum tlv_keep_active_partition_tie_breaker keep_active_partition_tie_breaker; + double dpd_interval_coefficient; }; extern int qnetd_advanced_settings_init(struct qnetd_advanced_settings *settings); diff --git a/qdevices/qnetd-dpd-timer.c b/qdevices/qnetd-client-dpd-timer.c similarity index 54% rename from qdevices/qnetd-dpd-timer.c rename to qdevices/qnetd-client-dpd-timer.c index 1917982..7b31ffe 100644 --- a/qdevices/qnetd-dpd-timer.c +++ b/qdevices/qnetd-client-dpd-timer.c @@ -33,53 +33,42 @@ */ #include "log.h" -#include "qnetd-dpd-timer.h" +#include "qnetd-client-dpd-timer.h" static int qnetd_dpd_timer_cb(void *data1, void *data2) { - struct qnetd_instance *instance; struct qnetd_client *client; - instance = (struct qnetd_instance *)data1; + client = (struct qnetd_client *)data1; - TAILQ_FOREACH(client, &instance->clients, entries) { - if (!client->init_received) { - continue; - } + log(LOG_WARNING, "Client %s doesn't sent any message during " + "%" PRIu32 "ms. Disconnecting", + client->addr_str, + timer_list_entry_get_interval(client->dpd_timer)); - client->dpd_time_since_last_check += instance->advanced_settings->dpd_interval; + client->schedule_disconnect = 1; + /* + * Timer gets removed by timer-list because of returning 0 + */ + client->dpd_timer = NULL; - if (client->dpd_time_since_last_check > client->heartbeat_interval * 1.2) { - if (!client->dpd_msg_received_since_last_check) { - log(LOG_WARNING, "Client %s doesn't sent any message during " - "%"PRIu32"ms. Disconnecting", - client->addr_str, client->dpd_time_since_last_check); - - client->schedule_disconnect = 1; - } else { - client->dpd_time_since_last_check = 0; - client->dpd_msg_received_since_last_check = 0; - } - } - } - - return (-1); + return (0); } int -qnetd_dpd_timer_init(struct qnetd_instance *instance) +qnetd_client_dpd_timer_init(struct qnetd_instance *instance, struct qnetd_client *client) { if (!instance->advanced_settings->dpd_enabled) { return (0); } - instance->dpd_timer = timer_list_add(pr_poll_loop_get_timer_list(&instance->main_poll_loop), - instance->advanced_settings->dpd_interval, - qnetd_dpd_timer_cb, (void *)instance, NULL); - if (instance->dpd_timer == NULL) { - log(LOG_ERR, "Can't initialize dpd timer"); + client->dpd_timer = timer_list_add(pr_poll_loop_get_timer_list(&instance->main_poll_loop), + (PRUint32)(instance->advanced_settings->dpd_interval_coefficient * client->heartbeat_interval), + qnetd_dpd_timer_cb, (void *)client, NULL); + if (client->dpd_timer == NULL) { + log(LOG_ERR, "Can't initialize dpd timer for client %s", client->addr_str); return (-1); } @@ -88,11 +77,39 @@ qnetd_dpd_timer_init(struct qnetd_instance *instance) } void -qnetd_dpd_timer_destroy(struct qnetd_instance *instance) +qnetd_client_dpd_timer_destroy(struct qnetd_instance *instance, struct qnetd_client *client) { - if (instance->dpd_timer != NULL) { - timer_list_entry_delete(pr_poll_loop_get_timer_list(&instance->main_poll_loop), instance->dpd_timer); - instance->dpd_timer = NULL; + if (client->dpd_timer != NULL) { + timer_list_entry_delete(pr_poll_loop_get_timer_list(&instance->main_poll_loop), + client->dpd_timer); + + client->dpd_timer = NULL; } } + +void +qnetd_client_dpd_timer_reschedule(struct qnetd_instance *instance, struct qnetd_client *client) +{ + + if (client->dpd_timer != NULL) { + timer_list_entry_reschedule(pr_poll_loop_get_timer_list(&instance->main_poll_loop), + client->dpd_timer); + } +} + +int +qnetd_client_dpd_timer_update_interval(struct qnetd_instance *instance, struct qnetd_client *client) +{ + int res; + + if (client->dpd_timer == NULL) { + return (0); + } + + res = timer_list_entry_set_interval(pr_poll_loop_get_timer_list(&instance->main_poll_loop), + client->dpd_timer, + (PRUint32)(instance->advanced_settings->dpd_interval_coefficient * client->heartbeat_interval)); + + return (res); +} diff --git a/qdevices/qnetd-dpd-timer.h b/qdevices/qnetd-client-dpd-timer.h similarity index 74% rename from qdevices/qnetd-dpd-timer.h rename to qdevices/qnetd-client-dpd-timer.h index ddd8fac..9b56fed 100644 --- a/qdevices/qnetd-dpd-timer.h +++ b/qdevices/qnetd-client-dpd-timer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016 Red Hat, Inc. + * Copyright (c) 2015-2020 Red Hat, Inc. * * All rights reserved. * @@ -32,8 +32,8 @@ * THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef _QNETD_DPD_TIMER_H_ -#define _QNETD_DPD_TIMER_H_ +#ifndef _QNETD_CLIENT_DPD_TIMER_H_ +#define _QNETD_CLIENT_DPD_TIMER_H_ #include "qnetd-instance.h" @@ -41,12 +41,20 @@ extern "C" { #endif -extern int qnetd_dpd_timer_init(struct qnetd_instance *instance); +extern int qnetd_client_dpd_timer_init(struct qnetd_instance *instance, + struct qnetd_client *client); -extern void qnetd_dpd_timer_destroy(struct qnetd_instance *instance); +extern void qnetd_client_dpd_timer_destroy(struct qnetd_instance *instance, + struct qnetd_client *client); + +extern void qnetd_client_dpd_timer_reschedule(struct qnetd_instance *instance, + struct qnetd_client *client); + +extern int qnetd_client_dpd_timer_update_interval(struct qnetd_instance *instance, + struct qnetd_client *client); #ifdef __cplusplus } #endif -#endif /* _QNETD_DPD_TIMER_H_ */ +#endif /* _QNETD_CLIENT_DPD_TIMER_H_ */ diff --git a/qdevices/qnetd-client-msg-received.c b/qdevices/qnetd-client-msg-received.c index 3f1a3d1..3090c09 100644 --- a/qdevices/qnetd-client-msg-received.c +++ b/qdevices/qnetd-client-msg-received.c @@ -40,6 +40,7 @@ #include "qnetd-instance.h" #include "qnetd-log-debug.h" #include "qnetd-client-send.h" +#include "qnetd-client-dpd-timer.h" #include "msg.h" #include "nss-sock.h" @@ -368,6 +369,10 @@ qnetd_client_msg_received_init(struct qnetd_instance *instance, struct qnetd_cli reply_error_code = TLV_REPLY_ERROR_CODE_INVALID_HEARTBEAT_INTERVAL; } else { client->heartbeat_interval = msg->heartbeat_interval; + + if (qnetd_client_dpd_timer_update_interval(instance, client) != 0) { + reply_error_code = TLV_REPLY_ERROR_CODE_INVALID_HEARTBEAT_INTERVAL; + } } } @@ -569,6 +574,15 @@ qnetd_client_msg_received_set_option(struct qnetd_instance *instance, struct qne } client->heartbeat_interval = msg->heartbeat_interval; + + if (qnetd_client_dpd_timer_update_interval(instance, client) != 0) { + if (qnetd_client_send_err(client, msg->seq_number_set, msg->seq_number, + TLV_REPLY_ERROR_CODE_INVALID_HEARTBEAT_INTERVAL) != 0) { + return (-1); + } + + return (0); + } } if (msg->keep_active_partition_tie_breaker_set) { @@ -1148,7 +1162,7 @@ qnetd_client_msg_received(struct qnetd_instance *instance, struct qnetd_client * int ret_val; int msg_processed; - client->dpd_msg_received_since_last_check = 1; + qnetd_client_dpd_timer_reschedule(instance, client); msg_decoded_init(&msg); diff --git a/qdevices/qnetd-client-net.c b/qdevices/qnetd-client-net.c index a45603e..391bb87 100644 --- a/qdevices/qnetd-client-net.c +++ b/qdevices/qnetd-client-net.c @@ -38,6 +38,7 @@ #include "msgio.h" #include "msg.h" #include "nss-sock.h" +#include "qnetd-client-dpd-timer.h" #include "qnetd-client-net.h" #include "qnetd-client-send.h" #include "qnetd-client-msg-received.h" @@ -322,11 +323,29 @@ qnetd_client_net_accept(struct qnetd_instance *instance) instance, client) == -1) { log(LOG_ERR, "Can't add client to main poll loop"); res_err = -2; - goto exit_close; + goto exit_client_list_del_close; + } + + if (qnetd_client_dpd_timer_init(instance, client) == -1) { + res_err = -2; + goto exit_client_nspr_list_del_close; } return (0); +exit_client_nspr_list_del_close: + if (pr_poll_loop_del_prfd(&instance->main_poll_loop, client_socket) == -1) { + log(LOG_ERR, "pr_poll_loop_del_prfd for client socket failed"); + } + +exit_client_list_del_close: + qnetd_client_list_del(&instance->clients, client); + /* + * client_addr_str is passed to qnetd_client_list_add and becomes part of client struct. + * qnetd_client_list_del calls qnetd_client_destroy which frees this memory + */ + client_addr_str = NULL; + exit_close: free(client_addr_str); PR_Close(client_socket); diff --git a/qdevices/qnetd-client.c b/qdevices/qnetd-client.c index 229123e..2e2b6e9 100644 --- a/qdevices/qnetd-client.c +++ b/qdevices/qnetd-client.c @@ -56,6 +56,10 @@ qnetd_client_init(struct qnetd_client *client, PRFileDesc *sock, PRNetAddr *addr node_list_init(&client->last_membership_node_list); node_list_init(&client->last_quorum_node_list); client->main_timer_list = main_timer_list; + /* + * Set max heartbeat interval before client sends init msg + */ + client->heartbeat_interval = QNETD_DEFAULT_HEARTBEAT_INTERVAL_MAX; } void diff --git a/qdevices/qnetd-client.h b/qdevices/qnetd-client.h index 429b5f3..e59098d 100644 --- a/qdevices/qnetd-client.h +++ b/qdevices/qnetd-client.h @@ -83,8 +83,7 @@ struct qnetd_client { struct timer_list_entry *algo_timer; uint32_t algo_timer_vote_info_msq_seq_number; int schedule_disconnect; - uint32_t dpd_time_since_last_check; - uint32_t dpd_msg_received_since_last_check; + struct timer_list_entry *dpd_timer; enum tlv_vote last_sent_vote; enum tlv_vote last_sent_ack_nack_vote; enum tlv_heuristics last_membership_heuristics; /* Passed in membership node list */ diff --git a/qdevices/qnetd-instance.c b/qdevices/qnetd-instance.c index a0bc9ed..e8d9d14 100644 --- a/qdevices/qnetd-instance.c +++ b/qdevices/qnetd-instance.c @@ -37,9 +37,9 @@ #include #include "qnetd-instance.h" #include "qnetd-client.h" +#include "qnetd-client-dpd-timer.h" #include "qnetd-algorithm.h" #include "qnetd-log-debug.h" -#include "qnetd-dpd-timer.h" #include "qnetd-client-algo-timer.h" int @@ -62,10 +62,6 @@ qnetd_instance_init(struct qnetd_instance *instance, pr_poll_loop_init(&instance->main_poll_loop); - if (qnetd_dpd_timer_init(instance) != 0) { - return (0); - } - return (0); } @@ -75,8 +71,6 @@ qnetd_instance_destroy(struct qnetd_instance *instance) struct qnetd_client *client; struct qnetd_client *client_next; - qnetd_dpd_timer_destroy(instance); - client = TAILQ_FIRST(&instance->clients); while (client != NULL) { client_next = TAILQ_NEXT(client, entries); @@ -105,6 +99,8 @@ qnetd_instance_client_disconnect(struct qnetd_instance *instance, struct qnetd_c qnetd_algorithm_client_disconnect(client, server_going_down); } + qnetd_client_dpd_timer_destroy(instance, client); + PR_Close(client->socket); if (client->cluster != NULL) { qnetd_cluster_list_del_client(&instance->clusters, client->cluster, client); diff --git a/qdevices/qnetd-instance.h b/qdevices/qnetd-instance.h index 03cfbad..d2777f3 100644 --- a/qdevices/qnetd-instance.h +++ b/qdevices/qnetd-instance.h @@ -67,7 +67,6 @@ struct qnetd_instance { int tls_client_cert_required; const char *host_addr; uint16_t host_port; - struct timer_list_entry *dpd_timer; /* Dead peer detection timer */ struct unix_socket_ipc local_ipc; const struct qnetd_advanced_settings *advanced_settings; struct pr_poll_loop main_poll_loop;