From 2ba4ebe09e9e6de0f162168fa79ab422e96504f7 Mon Sep 17 00:00:00 2001
From: Angus Salkeld <asalkeld@redhat.com>
Date: Thu, 15 Dec 2011 10:43:00 +1100
Subject: [PATCH] Fix cpgbench (large message sizes)

To allow async cpg messages of 1M we need to:
1) increase the totem queue size 4 times
2) set the critical level so that room for one large message stays free

There are a number of reasons for doing this:

We can't let cpg_mcast_joined() fail because the user will not see it
and will assume it has succeeded.

The reason we are getting good performance is by providing a negative
feedback loop from the totem q to the IPC/poll system. This relies
on 4 q states low/med/high/crit. With messages of size 1M you
now have a q of size one and now go from level low to crit instantly
then back to low as messages are put on and taken off. I don't think
this is the best behaviour. Having a q size of 4 allows the system
to utilize the q better and give us time to respond to changes in
the q level.

To effectively achieve flow control with a q of size 1 would require
all the clients to request the space on the q, as is done in
totempg_groups_joined_reserve(), but probably in shared memory.
This would take quite a bit of re-work.

Signed-off-by: Angus Salkeld <asalkeld@redhat.com>
---
 exec/totempg.c                    | 27 +++++++++++++++++----------
 include/corosync/engine/coroapi.h |  6 ++++--
 include/corosync/totem/totem.h    |  2 +-
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/exec/totempg.c b/exec/totempg.c
index 3ece489a..3a43541f 100644
--- a/exec/totempg.c
+++ b/exec/totempg.c
@@ -1011,6 +1011,16 @@ static void send_release (
 	totempg_stats.msg_reserved = totempg_reserved;
 }
 
+#ifndef HAVE_SMALL_MEMORY_FOOTPRINT
+#undef MESSAGE_QUEUE_MAX
+#define MESSAGE_QUEUE_MAX	((4 * MESSAGE_SIZE_MAX) / totempg_totem_config->net_mtu)
+#endif /* HAVE_SMALL_MEMORY_FOOTPRINT */
+
+static uint32_t q_level_precent_used(void)
+{
+	return (100 - (((totemmrp_avail() - totempg_reserved) * 100) / MESSAGE_QUEUE_MAX));
+}
+
 int totempg_callback_token_create (
 	void **handle_out,
 	enum totem_callback_token_type type,
@@ -1188,26 +1198,22 @@ int totempg_groups_mcast_joined (
 static void check_q_level(
 	void *totempg_groups_instance)
 {
-	int32_t old_level;
-	int32_t percent_used = 0;
 	struct totempg_group_instance *instance = (struct totempg_group_instance *)totempg_groups_instance;
+	int32_t old_level = instance->q_level;
+	int32_t percent_used = q_level_precent_used();
 
-	old_level = instance->q_level;
-	percent_used = 100 - (totemmrp_avail () * 100 / 800); /*(1024*1024/1500)*/
-
-	if (percent_used > 90 && instance->q_level != TOTEM_Q_LEVEL_CRITICAL) {
+	if (percent_used >= 75 && instance->q_level != TOTEM_Q_LEVEL_CRITICAL) {
 		instance->q_level = TOTEM_Q_LEVEL_CRITICAL;
 	} else if (percent_used < 30 && instance->q_level != TOTEM_Q_LEVEL_LOW) {
 		instance->q_level = TOTEM_Q_LEVEL_LOW;
-	} else if (percent_used > 40 && percent_used < 60 && instance->q_level != TOTEM_Q_LEVEL_GOOD) {
+	} else if (percent_used > 40 && percent_used < 50 && instance->q_level != TOTEM_Q_LEVEL_GOOD) {
 		instance->q_level = TOTEM_Q_LEVEL_GOOD;
-	} else if (percent_used > 70 && percent_used < 80 && instance->q_level != TOTEM_Q_LEVEL_HIGH) {
+	} else if (percent_used > 60 && percent_used < 70 && instance->q_level != TOTEM_Q_LEVEL_HIGH) {
 		instance->q_level = TOTEM_Q_LEVEL_HIGH;
 	}
 	if (totem_queue_level_changed && old_level != instance->q_level) {
 		totem_queue_level_changed(instance->q_level);
 	}
-
 }
 
 void totempg_check_q_level(
@@ -1239,7 +1245,6 @@ int totempg_groups_joined_reserve (
 	for (i = 0; i < iov_len; i++) {
 		size += iovec[i].iov_len;
 	}
-	check_q_level(instance);
 
 	if (size >= totempg_size_limit) {
 		reserved = -1;
@@ -1254,6 +1259,8 @@ int totempg_groups_joined_reserve (
 
 
 error_exit:
+	check_q_level(instance);
+
 	if (totempg_threaded_mode == 1) {
 		pthread_mutex_unlock (&mcast_msg_mutex);
 		pthread_mutex_unlock (&totempg_mutex);
diff --git a/include/corosync/engine/coroapi.h b/include/corosync/engine/coroapi.h
index 567d14f9..cabcbb3b 100644
--- a/include/corosync/engine/coroapi.h
+++ b/include/corosync/engine/coroapi.h
@@ -72,15 +72,17 @@ struct corosync_tpg_group {
 
 #define INTERFACE_MAX 2
 
+#ifndef MESSAGE_QUEUE_MAX
 #ifdef HAVE_SMALL_MEMORY_FOOTPRINT
 #define PROCESSOR_COUNT_MAX	16
 #define MESSAGE_SIZE_MAX	1024*64
 #define MESSAGE_QUEUE_MAX	512
 #else
 #define PROCESSOR_COUNT_MAX	384
-#define MESSAGE_SIZE_MAX	1024*1024 /* (1MB) */
-#define MESSAGE_QUEUE_MAX	MESSAGE_SIZE_MAX / totem_config->net_mtu
+#define MESSAGE_SIZE_MAX	1024*1024
+#define MESSAGE_QUEUE_MAX	((4 * MESSAGE_SIZE_MAX) / totem_config->net_mtu)
 #endif /* HAVE_SMALL_MEMORY_FOOTPRINT */
+#endif /* MESSAGE_QUEUE_MAX */
 
 #define TOTEM_AGREED	0
 #define TOTEM_SAFE	1
diff --git a/include/corosync/totem/totem.h b/include/corosync/totem/totem.h
index 2166143a..3d00318b 100644
--- a/include/corosync/totem/totem.h
+++ b/include/corosync/totem/totem.h
@@ -44,7 +44,7 @@
 #else
 #define PROCESSOR_COUNT_MAX	384
 #define MESSAGE_SIZE_MAX	1024*1024 /* (1MB) */
-#define MESSAGE_QUEUE_MAX	MESSAGE_SIZE_MAX / totem_config->net_mtu
+#define MESSAGE_QUEUE_MAX	((4 * MESSAGE_SIZE_MAX) / totem_config->net_mtu)
 #endif /* HAVE_SMALL_MEMORY_FOOTPRINT */
 
 #define FRAME_SIZE_MAX		10000