From 4caade92a4b59be8204201deafdb4621dc9f5905 Mon Sep 17 00:00:00 2001 From: Ayanda Dube Date: Mon, 9 Oct 2023 13:22:04 +0100 Subject: [PATCH 1/4] prevent multiple node disconnects in unstable networks when nodes are already disconnected, thus preventing application controller timeouts and node monitor terminations, which in some cases fail to recover completely. (cherry picked from commit 7ce9d8688a6bbb4650ec195765a7b747b3849bb1) (cherry picked from commit 0e57d92f1158ab0df1111a9367ca77d07d9a1d22) --- deps/rabbit/src/rabbit_node_monitor.erl | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/deps/rabbit/src/rabbit_node_monitor.erl b/deps/rabbit/src/rabbit_node_monitor.erl index 21abae0cea32..5fd455b135a0 100644 --- a/deps/rabbit/src/rabbit_node_monitor.erl +++ b/deps/rabbit/src/rabbit_node_monitor.erl @@ -34,6 +34,7 @@ -define(NODE_REPLY_TIMEOUT, 5000). -define(RABBIT_UP_RPC_TIMEOUT, 2000). -define(RABBIT_DOWN_PING_INTERVAL, 1000). +-define(NODE_DISCONNECTION_TIMEOUT, 1000). -record(state, {monitors, partitions, subscribers, down_ping_timer, keepalive_timer, autoheal, guid, node_guids}). @@ -893,13 +894,23 @@ upgrade_to_full_partition(Proxy) -> %% detect a very short partition. So we want to force a slightly %% longer disconnect. Unfortunately we don't have a way to blacklist %% individual nodes; the best we can do is turn off auto-connect -%% altogether. +%% altogether. If Node is not already part of the connected nodes, then +%% there's no need to repeat disabling dist_auto_connect and executing +%% disconnect_node/1, which can result in application_controller +%% timeouts and crash node monitor process. This also implies that +%% the already disconnected node was already processed. In an +%% unstable network, if we get consecutive 'up' and 'down' messages, +%% then we expect disconnect_node/1 to be executed. 
disconnect(Node) -> - application:set_env(kernel, dist_auto_connect, never), - erlang:disconnect_node(Node), - timer:sleep(1000), - application:unset_env(kernel, dist_auto_connect), - ok. + case lists:member(Node, nodes()) of + true -> + application:set_env(kernel, dist_auto_connect, never), + erlang:disconnect_node(Node), + timer:sleep(?NODE_DISCONNECTION_TIMEOUT), + application:unset_env(kernel, dist_auto_connect); + false -> + ok + end. %%-------------------------------------------------------------------- From 2a84c2451ec937f86772b1b08004cd5e3a5a46d8 Mon Sep 17 00:00:00 2001 From: Ayanda Dube Date: Mon, 9 Oct 2023 14:02:51 +0100 Subject: [PATCH 2/4] announce partition handling mechanism in node monitor start-up log (cherry picked from commit bc1af7c0c245c958892c0c7f97d337a0d69b8b8a) (cherry picked from commit 864ec125a3a0fda6a754efe4bf37509a9797dd28) # Conflicts: # deps/rabbit/src/rabbit_node_monitor.erl --- deps/rabbit/src/rabbit_node_monitor.erl | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/deps/rabbit/src/rabbit_node_monitor.erl b/deps/rabbit/src/rabbit_node_monitor.erl index 5fd455b135a0..fb1e3479c459 100644 --- a/deps/rabbit/src/rabbit_node_monitor.erl +++ b/deps/rabbit/src/rabbit_node_monitor.erl @@ -982,8 +982,17 @@ ping_all() -> possibly_partitioned_nodes() -> alive_rabbit_nodes() -- rabbit_nodes:all_running(). -startup_log([]) -> - rabbit_log:info("Starting rabbit_node_monitor", []); startup_log(Nodes) -> +<<<<<<< HEAD rabbit_log:info("Starting rabbit_node_monitor, might be partitioned from ~p", [Nodes]). +======= + {ok, M} = application:get_env(rabbit, cluster_partition_handling), + startup_log(Nodes, M). + +startup_log([], PartitionHandling) -> + rabbit_log:info("Starting rabbit_node_monitor (in ~tp mode)", [PartitionHandling]); +startup_log(Nodes, PartitionHandling) -> + rabbit_log:info("Starting rabbit_node_monitor (in ~tp mode), might be partitioned from ~tp", + [Nodes, PartitionHandling]). 
+>>>>>>> 864ec125a3 (announce partition handling mechanism in node monitor start-up log) From e5f46b0cd651993a52cf2da61672fdf5147218f1 Mon Sep 17 00:00:00 2001 From: Ayanda Dube Date: Mon, 9 Oct 2023 14:56:10 +0100 Subject: [PATCH 3/4] oops, fix parameter ordering (cherry picked from commit 6e5ad7d59efd855273f528d6bbffa09ed135b1ee) (cherry picked from commit c55b431345288c428355586f31ad3455be3a2a70) # Conflicts: # deps/rabbit/src/rabbit_node_monitor.erl --- deps/rabbit/src/rabbit_node_monitor.erl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/deps/rabbit/src/rabbit_node_monitor.erl b/deps/rabbit/src/rabbit_node_monitor.erl index fb1e3479c459..8e99fdfecab7 100644 --- a/deps/rabbit/src/rabbit_node_monitor.erl +++ b/deps/rabbit/src/rabbit_node_monitor.erl @@ -994,5 +994,9 @@ startup_log([], PartitionHandling) -> rabbit_log:info("Starting rabbit_node_monitor (in ~tp mode)", [PartitionHandling]); startup_log(Nodes, PartitionHandling) -> rabbit_log:info("Starting rabbit_node_monitor (in ~tp mode), might be partitioned from ~tp", +<<<<<<< HEAD [Nodes, PartitionHandling]). >>>>>>> 864ec125a3 (announce partition handling mechanism in node monitor start-up log) +======= + [PartitionHandling, Nodes]). +>>>>>>> c55b431345 (oops, fix parameter ordering) From bdcbc8dfe5bdf326f6e23d88ee870b9ce720c4eb Mon Sep 17 00:00:00 2001 From: Michael Klishin Date: Mon, 9 Oct 2023 18:48:33 -0400 Subject: [PATCH 4/4] Resolve a conflict (#9668) --- deps/rabbit/src/rabbit_node_monitor.erl | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/deps/rabbit/src/rabbit_node_monitor.erl b/deps/rabbit/src/rabbit_node_monitor.erl index 8e99fdfecab7..28e2c1658df7 100644 --- a/deps/rabbit/src/rabbit_node_monitor.erl +++ b/deps/rabbit/src/rabbit_node_monitor.erl @@ -983,10 +983,6 @@ possibly_partitioned_nodes() -> alive_rabbit_nodes() -- rabbit_nodes:all_running(). 
startup_log(Nodes) -> -<<<<<<< HEAD - rabbit_log:info("Starting rabbit_node_monitor, might be partitioned from ~p", - [Nodes]). -======= {ok, M} = application:get_env(rabbit, cluster_partition_handling), startup_log(Nodes, M). @@ -994,9 +990,5 @@ startup_log([], PartitionHandling) -> rabbit_log:info("Starting rabbit_node_monitor (in ~tp mode)", [PartitionHandling]); startup_log(Nodes, PartitionHandling) -> rabbit_log:info("Starting rabbit_node_monitor (in ~tp mode), might be partitioned from ~tp", -<<<<<<< HEAD - [Nodes, PartitionHandling]). ->>>>>>> 864ec125a3 (announce partition handling mechanism in node monitor start-up log) -======= [PartitionHandling, Nodes]). ->>>>>>> c55b431345 (oops, fix parameter ordering) +