mirror of
https://git.proxmox.com/git/mirror_corosync-qdevice
synced 2025-04-28 12:32:27 +00:00
qnetd: Improve dead peer detection
Previously, the dead peer detection timer was scheduled every dpd_interval. On each run it added dpd_interval to every client's timestamp and, if the timestamp was larger than the client heartbeat interval * 1.2, checked whether the client had sent some message. If so, the flag was reset. This method was the source of a number of problems, so a different method is now used instead. Every single client has its own timer with a timeout based on the (configurable) dpd_interval_coefficient multiplied by the client heartbeat timeout. When a message is received from the client, the timer is rescheduled. When the timer callback is called (= the client didn't send a message during the timeout), the client is disconnected. Signed-off-by: Jan Friesse <jfriesse@redhat.com>
This commit is contained in:
parent
8211cf2394
commit
a8b7513df9
@ -31,7 +31,7 @@
|
||||
.\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
||||
.\" * THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.\" */
|
||||
.TH COROSYNC-QNETD 8 2020-09-22
|
||||
.TH COROSYNC-QNETD 8 2020-11-18
|
||||
.SH NAME
|
||||
corosync-qnetd \- QNet daemon
|
||||
.SH SYNOPSIS
|
||||
@ -216,8 +216,9 @@ Maximum heartbeat timeout accepted by server in ms. (120000)
|
||||
.B dpd_enabled
|
||||
Dead peer detection enabled. (on)
|
||||
.TP
|
||||
.B dpd_interval
|
||||
How often the DPD algorithm detects dead peers in ms. (1000)
|
||||
.B dpd_interval_coefficient
|
||||
Value is multiplied with heartbeat interval sent by qdevice client and used as a timeout
|
||||
for dead peer detection. (1.5)
|
||||
.TP
|
||||
.B lock_file
|
||||
Lock file location. (/var/run/corosync-qnetd/corosync-qnetd.pid)
|
||||
|
@ -64,7 +64,7 @@ corosync_qnetd_SOURCES = corosync-qnetd.c \
|
||||
qnetd-client-msg-received.c qnetd-client-msg-received.h \
|
||||
qnetd-log-debug.c qnetd-log-debug.h \
|
||||
qnetd-client-algo-timer.c qnetd-client-algo-timer.h \
|
||||
qnetd-dpd-timer.c qnetd-dpd-timer.h \
|
||||
qnetd-client-dpd-timer.c qnetd-client-dpd-timer.h \
|
||||
qnetd-ipc.c qnetd-ipc.h unix-socket-ipc.c unix-socket-ipc.h \
|
||||
dynar-simple-lex.c dynar-simple-lex.h dynar-str.c dynar-str.h \
|
||||
unix-socket-client.c unix-socket-client.h \
|
||||
|
@ -237,6 +237,9 @@ cli_parse_long_opt(struct qnetd_advanced_settings *advanced_settings, const char
|
||||
case -2:
|
||||
errx(EXIT_FAILURE, "Invalid value '%s' for option '%s'", val, opt);
|
||||
break;
|
||||
case -3:
|
||||
warnx("Option '%s' is deprecated and has no effect anymore", opt);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -71,8 +71,9 @@ extern "C" {
|
||||
#define QNETD_MIN_HEARTBEAT_INTERVAL 1
|
||||
|
||||
#define QNETD_DEFAULT_DPD_ENABLED 1
|
||||
#define QNETD_DEFAULT_DPD_INTERVAL (1*1000)
|
||||
#define QNETD_MIN_DPD_INTERVAL 1
|
||||
#define QNETD_DEFAULT_DPD_INTERVAL_COEFFICIENT 1.5
|
||||
#define QNETD_MIN_DPD_INTERVAL_COEFFICIENT 1
|
||||
#define QNETD_MAX_DPD_INTERVAL_COEFFICIENT 1000
|
||||
|
||||
#define QNETD_DEFAULT_LOCK_FILE LOCALSTATEDIR "/run/corosync-qnetd/corosync-qnetd.pid"
|
||||
#define QNETD_DEFAULT_LOCAL_SOCKET_FILE LOCALSTATEDIR "/run/corosync-qnetd/corosync-qnetd.sock"
|
||||
|
@ -63,7 +63,7 @@ qnetd_advanced_settings_init(struct qnetd_advanced_settings *settings)
|
||||
settings->heartbeat_interval_min = QNETD_DEFAULT_HEARTBEAT_INTERVAL_MIN;
|
||||
settings->heartbeat_interval_max = QNETD_DEFAULT_HEARTBEAT_INTERVAL_MAX;
|
||||
settings->dpd_enabled = QNETD_DEFAULT_DPD_ENABLED;
|
||||
settings->dpd_interval = QNETD_DEFAULT_DPD_INTERVAL;
|
||||
settings->dpd_interval_coefficient = QNETD_DEFAULT_DPD_INTERVAL_COEFFICIENT;
|
||||
if ((settings->lock_file = strdup(QNETD_DEFAULT_LOCK_FILE)) == NULL) {
|
||||
return (-1);
|
||||
}
|
||||
@ -94,12 +94,14 @@ qnetd_advanced_settings_destroy(struct qnetd_advanced_settings *settings)
|
||||
* 0 - No error
|
||||
* -1 - Unknown option
|
||||
* -2 - Incorrect value
|
||||
* -3 - Deprecated value
|
||||
*/
|
||||
int
|
||||
qnetd_advanced_settings_set(struct qnetd_advanced_settings *settings,
|
||||
const char *option, const char *value)
|
||||
{
|
||||
long long int tmpll;
|
||||
double tmpdbl;
|
||||
|
||||
if (strcasecmp(option, "listen_backlog") == 0) {
|
||||
if (utils_strtonum(value, QNETD_MIN_LISTEN_BACKLOG, INT_MAX, &tmpll) == -1) {
|
||||
@ -156,11 +158,14 @@ qnetd_advanced_settings_set(struct qnetd_advanced_settings *settings,
|
||||
|
||||
settings->dpd_enabled = (uint8_t)tmpll;
|
||||
} else if (strcasecmp(option, "dpd_interval") == 0) {
|
||||
if (utils_strtonum(value, QNETD_MIN_DPD_INTERVAL, UINT32_MAX, &tmpll) == -1) {
|
||||
return (-3);
|
||||
} else if (strcasecmp(option, "dpd_interval_coefficient") == 0) {
|
||||
if (utils_strtod(value, QNETD_MIN_DPD_INTERVAL_COEFFICIENT,
|
||||
QNETD_MAX_DPD_INTERVAL_COEFFICIENT, &tmpdbl) == -1) {
|
||||
return (-2);
|
||||
}
|
||||
|
||||
settings->dpd_interval = (uint32_t)tmpll;
|
||||
settings->dpd_interval_coefficient = tmpdbl;
|
||||
} else if (strcasecmp(option, "lock_file") == 0) {
|
||||
free(settings->lock_file);
|
||||
|
||||
|
@ -49,7 +49,6 @@ struct qnetd_advanced_settings {
|
||||
uint32_t heartbeat_interval_min;
|
||||
uint32_t heartbeat_interval_max;
|
||||
uint8_t dpd_enabled;
|
||||
uint32_t dpd_interval;
|
||||
char *lock_file;
|
||||
char *local_socket_file;
|
||||
int local_socket_backlog;
|
||||
@ -57,6 +56,7 @@ struct qnetd_advanced_settings {
|
||||
size_t ipc_max_send_size;
|
||||
size_t ipc_max_receive_size;
|
||||
enum tlv_keep_active_partition_tie_breaker keep_active_partition_tie_breaker;
|
||||
double dpd_interval_coefficient;
|
||||
};
|
||||
|
||||
extern int qnetd_advanced_settings_init(struct qnetd_advanced_settings *settings);
|
||||
|
@ -33,53 +33,42 @@
|
||||
*/
|
||||
|
||||
#include "log.h"
|
||||
#include "qnetd-dpd-timer.h"
|
||||
#include "qnetd-client-dpd-timer.h"
|
||||
|
||||
static int
|
||||
qnetd_dpd_timer_cb(void *data1, void *data2)
|
||||
{
|
||||
struct qnetd_instance *instance;
|
||||
struct qnetd_client *client;
|
||||
|
||||
instance = (struct qnetd_instance *)data1;
|
||||
client = (struct qnetd_client *)data1;
|
||||
|
||||
TAILQ_FOREACH(client, &instance->clients, entries) {
|
||||
if (!client->init_received) {
|
||||
continue;
|
||||
}
|
||||
log(LOG_WARNING, "Client %s doesn't sent any message during "
|
||||
"%" PRIu32 "ms. Disconnecting",
|
||||
client->addr_str,
|
||||
timer_list_entry_get_interval(client->dpd_timer));
|
||||
|
||||
client->dpd_time_since_last_check += instance->advanced_settings->dpd_interval;
|
||||
client->schedule_disconnect = 1;
|
||||
/*
|
||||
* Timer gets removed by timer-list because of returning 0
|
||||
*/
|
||||
client->dpd_timer = NULL;
|
||||
|
||||
if (client->dpd_time_since_last_check > client->heartbeat_interval * 1.2) {
|
||||
if (!client->dpd_msg_received_since_last_check) {
|
||||
log(LOG_WARNING, "Client %s doesn't sent any message during "
|
||||
"%"PRIu32"ms. Disconnecting",
|
||||
client->addr_str, client->dpd_time_since_last_check);
|
||||
|
||||
client->schedule_disconnect = 1;
|
||||
} else {
|
||||
client->dpd_time_since_last_check = 0;
|
||||
client->dpd_msg_received_since_last_check = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (-1);
|
||||
return (0);
|
||||
}
|
||||
|
||||
int
|
||||
qnetd_dpd_timer_init(struct qnetd_instance *instance)
|
||||
qnetd_client_dpd_timer_init(struct qnetd_instance *instance, struct qnetd_client *client)
|
||||
{
|
||||
|
||||
if (!instance->advanced_settings->dpd_enabled) {
|
||||
return (0);
|
||||
}
|
||||
|
||||
instance->dpd_timer = timer_list_add(pr_poll_loop_get_timer_list(&instance->main_poll_loop),
|
||||
instance->advanced_settings->dpd_interval,
|
||||
qnetd_dpd_timer_cb, (void *)instance, NULL);
|
||||
if (instance->dpd_timer == NULL) {
|
||||
log(LOG_ERR, "Can't initialize dpd timer");
|
||||
client->dpd_timer = timer_list_add(pr_poll_loop_get_timer_list(&instance->main_poll_loop),
|
||||
(PRUint32)(instance->advanced_settings->dpd_interval_coefficient * client->heartbeat_interval),
|
||||
qnetd_dpd_timer_cb, (void *)client, NULL);
|
||||
if (client->dpd_timer == NULL) {
|
||||
log(LOG_ERR, "Can't initialize dpd timer for client %s", client->addr_str);
|
||||
|
||||
return (-1);
|
||||
}
|
||||
@ -88,11 +77,39 @@ qnetd_dpd_timer_init(struct qnetd_instance *instance)
|
||||
}
|
||||
|
||||
void
|
||||
qnetd_dpd_timer_destroy(struct qnetd_instance *instance)
|
||||
qnetd_client_dpd_timer_destroy(struct qnetd_instance *instance, struct qnetd_client *client)
|
||||
{
|
||||
|
||||
if (instance->dpd_timer != NULL) {
|
||||
timer_list_entry_delete(pr_poll_loop_get_timer_list(&instance->main_poll_loop), instance->dpd_timer);
|
||||
instance->dpd_timer = NULL;
|
||||
if (client->dpd_timer != NULL) {
|
||||
timer_list_entry_delete(pr_poll_loop_get_timer_list(&instance->main_poll_loop),
|
||||
client->dpd_timer);
|
||||
|
||||
client->dpd_timer = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
qnetd_client_dpd_timer_reschedule(struct qnetd_instance *instance, struct qnetd_client *client)
|
||||
{
|
||||
|
||||
if (client->dpd_timer != NULL) {
|
||||
timer_list_entry_reschedule(pr_poll_loop_get_timer_list(&instance->main_poll_loop),
|
||||
client->dpd_timer);
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
qnetd_client_dpd_timer_update_interval(struct qnetd_instance *instance, struct qnetd_client *client)
|
||||
{
|
||||
int res;
|
||||
|
||||
if (client->dpd_timer == NULL) {
|
||||
return (0);
|
||||
}
|
||||
|
||||
res = timer_list_entry_set_interval(pr_poll_loop_get_timer_list(&instance->main_poll_loop),
|
||||
client->dpd_timer,
|
||||
(PRUint32)(instance->advanced_settings->dpd_interval_coefficient * client->heartbeat_interval));
|
||||
|
||||
return (res);
|
||||
}
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2016 Red Hat, Inc.
|
||||
* Copyright (c) 2015-2020 Red Hat, Inc.
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
@ -32,8 +32,8 @@
|
||||
* THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _QNETD_DPD_TIMER_H_
|
||||
#define _QNETD_DPD_TIMER_H_
|
||||
#ifndef _QNETD_CLIENT_DPD_TIMER_H_
|
||||
#define _QNETD_CLIENT_DPD_TIMER_H_
|
||||
|
||||
#include "qnetd-instance.h"
|
||||
|
||||
@ -41,12 +41,20 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
extern int qnetd_dpd_timer_init(struct qnetd_instance *instance);
|
||||
extern int qnetd_client_dpd_timer_init(struct qnetd_instance *instance,
|
||||
struct qnetd_client *client);
|
||||
|
||||
extern void qnetd_dpd_timer_destroy(struct qnetd_instance *instance);
|
||||
extern void qnetd_client_dpd_timer_destroy(struct qnetd_instance *instance,
|
||||
struct qnetd_client *client);
|
||||
|
||||
extern void qnetd_client_dpd_timer_reschedule(struct qnetd_instance *instance,
|
||||
struct qnetd_client *client);
|
||||
|
||||
extern int qnetd_client_dpd_timer_update_interval(struct qnetd_instance *instance,
|
||||
struct qnetd_client *client);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _QNETD_DPD_TIMER_H_ */
|
||||
#endif /* _QNETD_CLIENT_DPD_TIMER_H_ */
|
@ -40,6 +40,7 @@
|
||||
#include "qnetd-instance.h"
|
||||
#include "qnetd-log-debug.h"
|
||||
#include "qnetd-client-send.h"
|
||||
#include "qnetd-client-dpd-timer.h"
|
||||
#include "msg.h"
|
||||
#include "nss-sock.h"
|
||||
|
||||
@ -368,6 +369,10 @@ qnetd_client_msg_received_init(struct qnetd_instance *instance, struct qnetd_cli
|
||||
reply_error_code = TLV_REPLY_ERROR_CODE_INVALID_HEARTBEAT_INTERVAL;
|
||||
} else {
|
||||
client->heartbeat_interval = msg->heartbeat_interval;
|
||||
|
||||
if (qnetd_client_dpd_timer_update_interval(instance, client) != 0) {
|
||||
reply_error_code = TLV_REPLY_ERROR_CODE_INVALID_HEARTBEAT_INTERVAL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -569,6 +574,15 @@ qnetd_client_msg_received_set_option(struct qnetd_instance *instance, struct qne
|
||||
}
|
||||
|
||||
client->heartbeat_interval = msg->heartbeat_interval;
|
||||
|
||||
if (qnetd_client_dpd_timer_update_interval(instance, client) != 0) {
|
||||
if (qnetd_client_send_err(client, msg->seq_number_set, msg->seq_number,
|
||||
TLV_REPLY_ERROR_CODE_INVALID_HEARTBEAT_INTERVAL) != 0) {
|
||||
return (-1);
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
}
|
||||
|
||||
if (msg->keep_active_partition_tie_breaker_set) {
|
||||
@ -1148,7 +1162,7 @@ qnetd_client_msg_received(struct qnetd_instance *instance, struct qnetd_client *
|
||||
int ret_val;
|
||||
int msg_processed;
|
||||
|
||||
client->dpd_msg_received_since_last_check = 1;
|
||||
qnetd_client_dpd_timer_reschedule(instance, client);
|
||||
|
||||
msg_decoded_init(&msg);
|
||||
|
||||
|
@ -38,6 +38,7 @@
|
||||
#include "msgio.h"
|
||||
#include "msg.h"
|
||||
#include "nss-sock.h"
|
||||
#include "qnetd-client-dpd-timer.h"
|
||||
#include "qnetd-client-net.h"
|
||||
#include "qnetd-client-send.h"
|
||||
#include "qnetd-client-msg-received.h"
|
||||
@ -322,11 +323,29 @@ qnetd_client_net_accept(struct qnetd_instance *instance)
|
||||
instance, client) == -1) {
|
||||
log(LOG_ERR, "Can't add client to main poll loop");
|
||||
res_err = -2;
|
||||
goto exit_close;
|
||||
goto exit_client_list_del_close;
|
||||
}
|
||||
|
||||
if (qnetd_client_dpd_timer_init(instance, client) == -1) {
|
||||
res_err = -2;
|
||||
goto exit_client_nspr_list_del_close;
|
||||
}
|
||||
|
||||
return (0);
|
||||
|
||||
exit_client_nspr_list_del_close:
|
||||
if (pr_poll_loop_del_prfd(&instance->main_poll_loop, client_socket) == -1) {
|
||||
log(LOG_ERR, "pr_poll_loop_del_prfd for client socket failed");
|
||||
}
|
||||
|
||||
exit_client_list_del_close:
|
||||
qnetd_client_list_del(&instance->clients, client);
|
||||
/*
|
||||
* client_addr_str is passed to qnetd_client_list_add and becomes part of client struct.
|
||||
* qnetd_client_list_del calls qnetd_client_destroy which frees this memory
|
||||
*/
|
||||
client_addr_str = NULL;
|
||||
|
||||
exit_close:
|
||||
free(client_addr_str);
|
||||
PR_Close(client_socket);
|
||||
|
@ -56,6 +56,10 @@ qnetd_client_init(struct qnetd_client *client, PRFileDesc *sock, PRNetAddr *addr
|
||||
node_list_init(&client->last_membership_node_list);
|
||||
node_list_init(&client->last_quorum_node_list);
|
||||
client->main_timer_list = main_timer_list;
|
||||
/*
|
||||
* Set max heartbeat interval before client sends init msg
|
||||
*/
|
||||
client->heartbeat_interval = QNETD_DEFAULT_HEARTBEAT_INTERVAL_MAX;
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -83,8 +83,7 @@ struct qnetd_client {
|
||||
struct timer_list_entry *algo_timer;
|
||||
uint32_t algo_timer_vote_info_msq_seq_number;
|
||||
int schedule_disconnect;
|
||||
uint32_t dpd_time_since_last_check;
|
||||
uint32_t dpd_msg_received_since_last_check;
|
||||
struct timer_list_entry *dpd_timer;
|
||||
enum tlv_vote last_sent_vote;
|
||||
enum tlv_vote last_sent_ack_nack_vote;
|
||||
enum tlv_heuristics last_membership_heuristics; /* Passed in membership node list */
|
||||
|
@ -37,9 +37,9 @@
|
||||
#include <pk11func.h>
|
||||
#include "qnetd-instance.h"
|
||||
#include "qnetd-client.h"
|
||||
#include "qnetd-client-dpd-timer.h"
|
||||
#include "qnetd-algorithm.h"
|
||||
#include "qnetd-log-debug.h"
|
||||
#include "qnetd-dpd-timer.h"
|
||||
#include "qnetd-client-algo-timer.h"
|
||||
|
||||
int
|
||||
@ -62,10 +62,6 @@ qnetd_instance_init(struct qnetd_instance *instance,
|
||||
|
||||
pr_poll_loop_init(&instance->main_poll_loop);
|
||||
|
||||
if (qnetd_dpd_timer_init(instance) != 0) {
|
||||
return (0);
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
@ -75,8 +71,6 @@ qnetd_instance_destroy(struct qnetd_instance *instance)
|
||||
struct qnetd_client *client;
|
||||
struct qnetd_client *client_next;
|
||||
|
||||
qnetd_dpd_timer_destroy(instance);
|
||||
|
||||
client = TAILQ_FIRST(&instance->clients);
|
||||
while (client != NULL) {
|
||||
client_next = TAILQ_NEXT(client, entries);
|
||||
@ -105,6 +99,8 @@ qnetd_instance_client_disconnect(struct qnetd_instance *instance, struct qnetd_c
|
||||
qnetd_algorithm_client_disconnect(client, server_going_down);
|
||||
}
|
||||
|
||||
qnetd_client_dpd_timer_destroy(instance, client);
|
||||
|
||||
PR_Close(client->socket);
|
||||
if (client->cluster != NULL) {
|
||||
qnetd_cluster_list_del_client(&instance->clusters, client->cluster, client);
|
||||
|
@ -67,7 +67,6 @@ struct qnetd_instance {
|
||||
int tls_client_cert_required;
|
||||
const char *host_addr;
|
||||
uint16_t host_port;
|
||||
struct timer_list_entry *dpd_timer; /* Dead peer detection timer */
|
||||
struct unix_socket_ipc local_ipc;
|
||||
const struct qnetd_advanced_settings *advanced_settings;
|
||||
struct pr_poll_loop main_poll_loop;
|
||||
|
Loading…
Reference in New Issue
Block a user