qnetd: Improve dead peer detection

Previously dead peer detection timer was scheduled every dpd_interval,
added dpd_interval to all of the clients timestamp and if timestamp was
larger than client hearbeat interval * 1.2 then check if client sent
some message. If so, flag was reset.

This method was source of number of problems so instead different method
is now used.

Every single client has its own timer with timeout based on
(configurable) dpd_interval_coefficient and multiplied with
client heartbeat timeout. When message is received from client timer is
rescheduled. When timer callback is called (= client doesn't sent
message during timeout) then client is disconnected.

Signed-off-by: Jan Friesse <jfriesse@redhat.com>
This commit is contained in:
Jan Friesse 2020-11-18 14:31:01 +01:00
parent 8211cf2394
commit a8b7513df9
14 changed files with 127 additions and 61 deletions

View File

@ -31,7 +31,7 @@
.\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
.\" * THE POSSIBILITY OF SUCH DAMAGE.
.\" */
.TH COROSYNC-QNETD 8 2020-09-22
.TH COROSYNC-QNETD 8 2020-11-18
.SH NAME
corosync-qnetd \- QNet daemon
.SH SYNOPSIS
@ -216,8 +216,9 @@ Maximum heartbeat timeout accepted by server in ms. (120000)
.B dpd_enabled
Dead peer detection enabled. (on)
.TP
.B dpd_interval
How often the DPD algorithm detects dead peers in ms. (1000)
.B dpd_interval_coefficient
Value is multiplied with heartbeat interval sent by qdevice client and used as a timeout
for dead peer detection. (1.5)
.TP
.B lock_file
Lock file location. (/var/run/corosync-qnetd/corosync-qnetd.pid)

View File

@ -64,7 +64,7 @@ corosync_qnetd_SOURCES = corosync-qnetd.c \
qnetd-client-msg-received.c qnetd-client-msg-received.h \
qnetd-log-debug.c qnetd-log-debug.h \
qnetd-client-algo-timer.c qnetd-client-algo-timer.h \
qnetd-dpd-timer.c qnetd-dpd-timer.h \
qnetd-client-dpd-timer.c qnetd-client-dpd-timer.h \
qnetd-ipc.c qnetd-ipc.h unix-socket-ipc.c unix-socket-ipc.h \
dynar-simple-lex.c dynar-simple-lex.h dynar-str.c dynar-str.h \
unix-socket-client.c unix-socket-client.h \

View File

@ -237,6 +237,9 @@ cli_parse_long_opt(struct qnetd_advanced_settings *advanced_settings, const char
case -2:
errx(EXIT_FAILURE, "Invalid value '%s' for option '%s'", val, opt);
break;
case -3:
warnx("Option '%s' is deprecated and has no effect anymore", opt);
break;
}
}

View File

@ -71,8 +71,9 @@ extern "C" {
#define QNETD_MIN_HEARTBEAT_INTERVAL 1
#define QNETD_DEFAULT_DPD_ENABLED 1
#define QNETD_DEFAULT_DPD_INTERVAL (1*1000)
#define QNETD_MIN_DPD_INTERVAL 1
#define QNETD_DEFAULT_DPD_INTERVAL_COEFFICIENT 1.5
#define QNETD_MIN_DPD_INTERVAL_COEFFICIENT 1
#define QNETD_MAX_DPD_INTERVAL_COEFFICIENT 1000
#define QNETD_DEFAULT_LOCK_FILE LOCALSTATEDIR "/run/corosync-qnetd/corosync-qnetd.pid"
#define QNETD_DEFAULT_LOCAL_SOCKET_FILE LOCALSTATEDIR "/run/corosync-qnetd/corosync-qnetd.sock"

View File

@ -63,7 +63,7 @@ qnetd_advanced_settings_init(struct qnetd_advanced_settings *settings)
settings->heartbeat_interval_min = QNETD_DEFAULT_HEARTBEAT_INTERVAL_MIN;
settings->heartbeat_interval_max = QNETD_DEFAULT_HEARTBEAT_INTERVAL_MAX;
settings->dpd_enabled = QNETD_DEFAULT_DPD_ENABLED;
settings->dpd_interval = QNETD_DEFAULT_DPD_INTERVAL;
settings->dpd_interval_coefficient = QNETD_DEFAULT_DPD_INTERVAL_COEFFICIENT;
if ((settings->lock_file = strdup(QNETD_DEFAULT_LOCK_FILE)) == NULL) {
return (-1);
}
@ -94,12 +94,14 @@ qnetd_advanced_settings_destroy(struct qnetd_advanced_settings *settings)
* 0 - No error
* -1 - Unknown option
* -2 - Incorrect value
* -3 - Deprecated value
*/
int
qnetd_advanced_settings_set(struct qnetd_advanced_settings *settings,
const char *option, const char *value)
{
long long int tmpll;
double tmpdbl;
if (strcasecmp(option, "listen_backlog") == 0) {
if (utils_strtonum(value, QNETD_MIN_LISTEN_BACKLOG, INT_MAX, &tmpll) == -1) {
@ -156,11 +158,14 @@ qnetd_advanced_settings_set(struct qnetd_advanced_settings *settings,
settings->dpd_enabled = (uint8_t)tmpll;
} else if (strcasecmp(option, "dpd_interval") == 0) {
if (utils_strtonum(value, QNETD_MIN_DPD_INTERVAL, UINT32_MAX, &tmpll) == -1) {
return (-3);
} else if (strcasecmp(option, "dpd_interval_coefficient") == 0) {
if (utils_strtod(value, QNETD_MIN_DPD_INTERVAL_COEFFICIENT,
QNETD_MAX_DPD_INTERVAL_COEFFICIENT, &tmpdbl) == -1) {
return (-2);
}
settings->dpd_interval = (uint32_t)tmpll;
settings->dpd_interval_coefficient = tmpdbl;
} else if (strcasecmp(option, "lock_file") == 0) {
free(settings->lock_file);

View File

@ -49,7 +49,6 @@ struct qnetd_advanced_settings {
uint32_t heartbeat_interval_min;
uint32_t heartbeat_interval_max;
uint8_t dpd_enabled;
uint32_t dpd_interval;
char *lock_file;
char *local_socket_file;
int local_socket_backlog;
@ -57,6 +56,7 @@ struct qnetd_advanced_settings {
size_t ipc_max_send_size;
size_t ipc_max_receive_size;
enum tlv_keep_active_partition_tie_breaker keep_active_partition_tie_breaker;
double dpd_interval_coefficient;
};
extern int qnetd_advanced_settings_init(struct qnetd_advanced_settings *settings);

View File

@ -33,53 +33,42 @@
*/
#include "log.h"
#include "qnetd-dpd-timer.h"
#include "qnetd-client-dpd-timer.h"
static int
qnetd_dpd_timer_cb(void *data1, void *data2)
{
struct qnetd_instance *instance;
struct qnetd_client *client;
instance = (struct qnetd_instance *)data1;
client = (struct qnetd_client *)data1;
TAILQ_FOREACH(client, &instance->clients, entries) {
if (!client->init_received) {
continue;
}
log(LOG_WARNING, "Client %s doesn't sent any message during "
"%" PRIu32 "ms. Disconnecting",
client->addr_str,
timer_list_entry_get_interval(client->dpd_timer));
client->dpd_time_since_last_check += instance->advanced_settings->dpd_interval;
client->schedule_disconnect = 1;
/*
* Timer gets removed by timer-list because of returning 0
*/
client->dpd_timer = NULL;
if (client->dpd_time_since_last_check > client->heartbeat_interval * 1.2) {
if (!client->dpd_msg_received_since_last_check) {
log(LOG_WARNING, "Client %s doesn't sent any message during "
"%"PRIu32"ms. Disconnecting",
client->addr_str, client->dpd_time_since_last_check);
client->schedule_disconnect = 1;
} else {
client->dpd_time_since_last_check = 0;
client->dpd_msg_received_since_last_check = 0;
}
}
}
return (-1);
return (0);
}
int
qnetd_dpd_timer_init(struct qnetd_instance *instance)
qnetd_client_dpd_timer_init(struct qnetd_instance *instance, struct qnetd_client *client)
{
if (!instance->advanced_settings->dpd_enabled) {
return (0);
}
instance->dpd_timer = timer_list_add(pr_poll_loop_get_timer_list(&instance->main_poll_loop),
instance->advanced_settings->dpd_interval,
qnetd_dpd_timer_cb, (void *)instance, NULL);
if (instance->dpd_timer == NULL) {
log(LOG_ERR, "Can't initialize dpd timer");
client->dpd_timer = timer_list_add(pr_poll_loop_get_timer_list(&instance->main_poll_loop),
(PRUint32)(instance->advanced_settings->dpd_interval_coefficient * client->heartbeat_interval),
qnetd_dpd_timer_cb, (void *)client, NULL);
if (client->dpd_timer == NULL) {
log(LOG_ERR, "Can't initialize dpd timer for client %s", client->addr_str);
return (-1);
}
@ -88,11 +77,39 @@ qnetd_dpd_timer_init(struct qnetd_instance *instance)
}
void
qnetd_dpd_timer_destroy(struct qnetd_instance *instance)
qnetd_client_dpd_timer_destroy(struct qnetd_instance *instance, struct qnetd_client *client)
{
if (instance->dpd_timer != NULL) {
timer_list_entry_delete(pr_poll_loop_get_timer_list(&instance->main_poll_loop), instance->dpd_timer);
instance->dpd_timer = NULL;
if (client->dpd_timer != NULL) {
timer_list_entry_delete(pr_poll_loop_get_timer_list(&instance->main_poll_loop),
client->dpd_timer);
client->dpd_timer = NULL;
}
}
void
qnetd_client_dpd_timer_reschedule(struct qnetd_instance *instance, struct qnetd_client *client)
{
if (client->dpd_timer != NULL) {
timer_list_entry_reschedule(pr_poll_loop_get_timer_list(&instance->main_poll_loop),
client->dpd_timer);
}
}
int
qnetd_client_dpd_timer_update_interval(struct qnetd_instance *instance, struct qnetd_client *client)
{
int res;
if (client->dpd_timer == NULL) {
return (0);
}
res = timer_list_entry_set_interval(pr_poll_loop_get_timer_list(&instance->main_poll_loop),
client->dpd_timer,
(PRUint32)(instance->advanced_settings->dpd_interval_coefficient * client->heartbeat_interval));
return (res);
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016 Red Hat, Inc.
* Copyright (c) 2015-2020 Red Hat, Inc.
*
* All rights reserved.
*
@ -32,8 +32,8 @@
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _QNETD_DPD_TIMER_H_
#define _QNETD_DPD_TIMER_H_
#ifndef _QNETD_CLIENT_DPD_TIMER_H_
#define _QNETD_CLIENT_DPD_TIMER_H_
#include "qnetd-instance.h"
@ -41,12 +41,20 @@
extern "C" {
#endif
extern int qnetd_dpd_timer_init(struct qnetd_instance *instance);
extern int qnetd_client_dpd_timer_init(struct qnetd_instance *instance,
struct qnetd_client *client);
extern void qnetd_dpd_timer_destroy(struct qnetd_instance *instance);
extern void qnetd_client_dpd_timer_destroy(struct qnetd_instance *instance,
struct qnetd_client *client);
extern void qnetd_client_dpd_timer_reschedule(struct qnetd_instance *instance,
struct qnetd_client *client);
extern int qnetd_client_dpd_timer_update_interval(struct qnetd_instance *instance,
struct qnetd_client *client);
#ifdef __cplusplus
}
#endif
#endif /* _QNETD_DPD_TIMER_H_ */
#endif /* _QNETD_CLIENT_DPD_TIMER_H_ */

View File

@ -40,6 +40,7 @@
#include "qnetd-instance.h"
#include "qnetd-log-debug.h"
#include "qnetd-client-send.h"
#include "qnetd-client-dpd-timer.h"
#include "msg.h"
#include "nss-sock.h"
@ -368,6 +369,10 @@ qnetd_client_msg_received_init(struct qnetd_instance *instance, struct qnetd_cli
reply_error_code = TLV_REPLY_ERROR_CODE_INVALID_HEARTBEAT_INTERVAL;
} else {
client->heartbeat_interval = msg->heartbeat_interval;
if (qnetd_client_dpd_timer_update_interval(instance, client) != 0) {
reply_error_code = TLV_REPLY_ERROR_CODE_INVALID_HEARTBEAT_INTERVAL;
}
}
}
@ -569,6 +574,15 @@ qnetd_client_msg_received_set_option(struct qnetd_instance *instance, struct qne
}
client->heartbeat_interval = msg->heartbeat_interval;
if (qnetd_client_dpd_timer_update_interval(instance, client) != 0) {
if (qnetd_client_send_err(client, msg->seq_number_set, msg->seq_number,
TLV_REPLY_ERROR_CODE_INVALID_HEARTBEAT_INTERVAL) != 0) {
return (-1);
}
return (0);
}
}
if (msg->keep_active_partition_tie_breaker_set) {
@ -1148,7 +1162,7 @@ qnetd_client_msg_received(struct qnetd_instance *instance, struct qnetd_client *
int ret_val;
int msg_processed;
client->dpd_msg_received_since_last_check = 1;
qnetd_client_dpd_timer_reschedule(instance, client);
msg_decoded_init(&msg);

View File

@ -38,6 +38,7 @@
#include "msgio.h"
#include "msg.h"
#include "nss-sock.h"
#include "qnetd-client-dpd-timer.h"
#include "qnetd-client-net.h"
#include "qnetd-client-send.h"
#include "qnetd-client-msg-received.h"
@ -322,11 +323,29 @@ qnetd_client_net_accept(struct qnetd_instance *instance)
instance, client) == -1) {
log(LOG_ERR, "Can't add client to main poll loop");
res_err = -2;
goto exit_close;
goto exit_client_list_del_close;
}
if (qnetd_client_dpd_timer_init(instance, client) == -1) {
res_err = -2;
goto exit_client_nspr_list_del_close;
}
return (0);
exit_client_nspr_list_del_close:
if (pr_poll_loop_del_prfd(&instance->main_poll_loop, client_socket) == -1) {
log(LOG_ERR, "pr_poll_loop_del_prfd for client socket failed");
}
exit_client_list_del_close:
qnetd_client_list_del(&instance->clients, client);
/*
* client_addr_str is passed to qnetd_client_list_add and becomes part of client struct.
* qnetd_client_list_del calls qnetd_client_destroy which frees this memory
*/
client_addr_str = NULL;
exit_close:
free(client_addr_str);
PR_Close(client_socket);

View File

@ -56,6 +56,10 @@ qnetd_client_init(struct qnetd_client *client, PRFileDesc *sock, PRNetAddr *addr
node_list_init(&client->last_membership_node_list);
node_list_init(&client->last_quorum_node_list);
client->main_timer_list = main_timer_list;
/*
* Set max heartbeat interval before client sends init msg
*/
client->heartbeat_interval = QNETD_DEFAULT_HEARTBEAT_INTERVAL_MAX;
}
void

View File

@ -83,8 +83,7 @@ struct qnetd_client {
struct timer_list_entry *algo_timer;
uint32_t algo_timer_vote_info_msq_seq_number;
int schedule_disconnect;
uint32_t dpd_time_since_last_check;
uint32_t dpd_msg_received_since_last_check;
struct timer_list_entry *dpd_timer;
enum tlv_vote last_sent_vote;
enum tlv_vote last_sent_ack_nack_vote;
enum tlv_heuristics last_membership_heuristics; /* Passed in membership node list */

View File

@ -37,9 +37,9 @@
#include <pk11func.h>
#include "qnetd-instance.h"
#include "qnetd-client.h"
#include "qnetd-client-dpd-timer.h"
#include "qnetd-algorithm.h"
#include "qnetd-log-debug.h"
#include "qnetd-dpd-timer.h"
#include "qnetd-client-algo-timer.h"
int
@ -62,10 +62,6 @@ qnetd_instance_init(struct qnetd_instance *instance,
pr_poll_loop_init(&instance->main_poll_loop);
if (qnetd_dpd_timer_init(instance) != 0) {
return (0);
}
return (0);
}
@ -75,8 +71,6 @@ qnetd_instance_destroy(struct qnetd_instance *instance)
struct qnetd_client *client;
struct qnetd_client *client_next;
qnetd_dpd_timer_destroy(instance);
client = TAILQ_FIRST(&instance->clients);
while (client != NULL) {
client_next = TAILQ_NEXT(client, entries);
@ -105,6 +99,8 @@ qnetd_instance_client_disconnect(struct qnetd_instance *instance, struct qnetd_c
qnetd_algorithm_client_disconnect(client, server_going_down);
}
qnetd_client_dpd_timer_destroy(instance, client);
PR_Close(client->socket);
if (client->cluster != NULL) {
qnetd_cluster_list_del_client(&instance->clusters, client->cluster, client);

View File

@ -67,7 +67,6 @@ struct qnetd_instance {
int tls_client_cert_required;
const char *host_addr;
uint16_t host_port;
struct timer_list_entry *dpd_timer; /* Dead peer detection timer */
struct unix_socket_ipc local_ipc;
const struct qnetd_advanced_settings *advanced_settings;
struct pr_poll_loop main_poll_loop;