From f579a62258ff406add338ac1ca7ee2198686f4cb Mon Sep 17 00:00:00 2001 From: xyuanlu Date: Mon, 11 Sep 2023 16:26:26 -0700 Subject: [PATCH] change operation orth to enable --- .../helix/constants/InstanceConstants.java | 4 +- .../rebalancer/DelayedAutoRebalancer.java | 15 ++-- .../rebalancer/util/DelayedRebalanceUtil.java | 2 +- .../apache/helix/model/InstanceConfig.java | 22 +----- .../rebalancer/TestInstanceOperation.java | 68 +++++++++++++++++-- .../rest/server/TestPerInstanceAccessor.java | 20 +----- 6 files changed, 73 insertions(+), 58 deletions(-) diff --git a/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java b/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java index 379bbaf022..e2cc2de2d5 100644 --- a/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java +++ b/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java @@ -12,8 +12,6 @@ public enum InstanceDisabledType { public enum InstanceOperation { EVACUATE, // Node will be removed after a period of time SWAP_IN, // New node joining for swap operation - SWAP_OUT, // Existing Node to be removed for swap operation - ENABLE, // Backward compatible field for HELIX_ENABLED. Set when changing from disabled to enabled. - DISABLE // Backward compatible field for HELIX_ENABLED. Set when changing from enabled to disabled. 
+ SWAP_OUT // Existing Node to be removed for swap operation } } diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/DelayedAutoRebalancer.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/DelayedAutoRebalancer.java index ad36b50195..7f7073963c 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/DelayedAutoRebalancer.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/DelayedAutoRebalancer.java @@ -109,14 +109,12 @@ public IdealState computeNewIdealState(String resourceName, allNodes = clusterData.getAllInstances(); } - Set activeNodes = liveEnabledNodes; + long delay = DelayedRebalanceUtil.getRebalanceDelay(currentIdealState, clusterConfig); + Set activeNodes = DelayedRebalanceUtil + .getActiveNodes(allNodes, currentIdealState, liveEnabledNodes, + clusterData.getInstanceOfflineTimeMap(), clusterData.getLiveInstances().keySet(), + clusterData.getInstanceConfigMap(), delay, clusterConfig); if (delayRebalanceEnabled) { - long delay = DelayedRebalanceUtil.getRebalanceDelay(currentIdealState, clusterConfig); - activeNodes = DelayedRebalanceUtil - .getActiveNodes(allNodes, currentIdealState, liveEnabledNodes, - clusterData.getInstanceOfflineTimeMap(), clusterData.getLiveInstances().keySet(), - clusterData.getInstanceConfigMap(), delay, clusterConfig); - Set offlineOrDisabledInstances = new HashSet<>(activeNodes); offlineOrDisabledInstances.removeAll(liveEnabledNodes); DelayedRebalanceUtil.setRebalanceScheduler(currentIdealState.getResourceName(), true, @@ -165,7 +163,8 @@ public IdealState computeNewIdealState(String resourceName, .computePartitionAssignment(allNodeList, liveEnabledNodeList, currentMapping, clusterData); ZNRecord finalMapping = newIdealMapping; - if (DelayedRebalanceUtil.isDelayRebalanceEnabled(currentIdealState, clusterConfig)) { + if (DelayedRebalanceUtil.isDelayRebalanceEnabled(currentIdealState, clusterConfig) + || liveEnabledNodeList.size()!= 
activeNodes.size()) { List activeNodeList = new ArrayList<>(activeNodes); Collections.sort(activeNodeList); int minActiveReplicas = DelayedRebalanceUtil.getMinActiveReplica( diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/util/DelayedRebalanceUtil.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/util/DelayedRebalanceUtil.java index e7ff99765d..0575cb9170 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/util/DelayedRebalanceUtil.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/util/DelayedRebalanceUtil.java @@ -129,7 +129,7 @@ private static Set getActiveNodes(Set allNodes, Set live return activeNodes; } - private static Set filterOutEvacuatingInstances(Map instanceConfigMap, + public static Set filterOutEvacuatingInstances(Map instanceConfigMap, Set nodes) { return nodes.stream() .filter(instance -> !instanceConfigMap.get(instance).getInstanceOperation().equals( diff --git a/helix-core/src/main/java/org/apache/helix/model/InstanceConfig.java b/helix-core/src/main/java/org/apache/helix/model/InstanceConfig.java index 193019d0e7..45e0476ba0 100644 --- a/helix-core/src/main/java/org/apache/helix/model/InstanceConfig.java +++ b/helix-core/src/main/java/org/apache/helix/model/InstanceConfig.java @@ -265,15 +265,6 @@ public boolean getInstanceEnabled() { */ public void setInstanceEnabled(boolean enabled) { // set instance operation only when we need to change InstanceEnabled value. - // When enabling an instance where current HELIX_ENABLED is false, we update INSTANCE_OPERATION to 'ENABLE' - // When disabling and instance where current HELIX_ENABLED is false, we overwrite what current operation and - // update INSTANCE_OPERATION to 'DISABLE'. - String instanceOperationKey = InstanceConfigProperty.INSTANCE_OPERATION.toString(); - if (enabled != getInstanceEnabled()) { - _record.setSimpleField(instanceOperationKey, - enabled ? 
InstanceConstants.InstanceOperation.ENABLE.name() - : InstanceConstants.InstanceOperation.DISABLE.name()); - } setInstanceEnabledHelper(enabled); } @@ -344,17 +335,8 @@ public long getInstanceEnabledTime() { } public void setInstanceOperation(InstanceConstants.InstanceOperation operation) { - if (operation != InstanceConstants.InstanceOperation.DISABLE - && operation != InstanceConstants.InstanceOperation.ENABLE) { - if (!getInstanceEnabled()) { - throw new HelixException( - "setting non enable/disable operation (e.g. evacuate, swap) to helix disabled instance is not allowed"); - } - } else { - setInstanceEnabledHelper(operation == InstanceConstants.InstanceOperation.ENABLE); - } - - _record.setSimpleField(InstanceConfigProperty.INSTANCE_OPERATION.toString(), operation.toString()); + _record.setSimpleField(InstanceConfigProperty.INSTANCE_OPERATION.name(), + operation == null ? "" : operation.name()); } public String getInstanceOperation() { diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java index 2276ab1c37..f65657324a 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java @@ -125,7 +125,7 @@ public void testRevertEvacuation() throws Exception { // revert an evacuate instance String instanceToEvacuate = _participants.get(0).getInstanceName(); _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, InstanceConstants.InstanceOperation.ENABLE); + .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, null); Assert.assertTrue(_clusterVerifier.verifyByPolling()); @@ -138,6 +138,55 @@ public void testRevertEvacuation() throws Exception { } @Test(dependsOnMethods = "testRevertEvacuation") + public void testAddingNodeWithEvacuationTag() 
throws Exception { + // first disable an instance, and wait for all replicas to be moved out + String mockNewInstance = _participants.get(0).getInstanceName(); + _gSetupTool.getClusterManagementTool() + .enableInstance(CLUSTER_NAME, mockNewInstance, false); + Assert.assertTrue(_clusterVerifier.verifyByPolling()); + //ev should contain all instances but the disabled one + Map assignment = getEV(); + List currentActiveInstances = + _participantNames.stream().filter(n -> !n.equals(mockNewInstance)).collect(Collectors.toList()); + for (String resource : _allDBs) { + validateAssignmentInEv(assignment.get(resource)); + Set newPAssignedParticipants = getParticipantsInEv(assignment.get(resource)); + Assert.assertFalse(newPAssignedParticipants.contains(mockNewInstance)); + Assert.assertTrue(newPAssignedParticipants.containsAll(currentActiveInstances)); + } + + // add evacuate tag and enable instance + _gSetupTool.getClusterManagementTool() + .setInstanceOperation(CLUSTER_NAME, mockNewInstance, InstanceConstants.InstanceOperation.EVACUATE); + _gSetupTool.getClusterManagementTool() + .enableInstance(CLUSTER_NAME, mockNewInstance, true); + //ev should be the same + assignment = getEV(); + currentActiveInstances = + _participantNames.stream().filter(n -> !n.equals(mockNewInstance)).collect(Collectors.toList()); + for (String resource : _allDBs) { + validateAssignmentInEv(assignment.get(resource)); + Set newPAssignedParticipants = getParticipantsInEv(assignment.get(resource)); + Assert.assertFalse(newPAssignedParticipants.contains(mockNewInstance)); + Assert.assertTrue(newPAssignedParticipants.containsAll(currentActiveInstances)); + } + + // now remove operation tag + String instanceToEvacuate = _participants.get(0).getInstanceName(); + _gSetupTool.getClusterManagementTool() + .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, null); + + Assert.assertTrue(_clusterVerifier.verifyByPolling()); + + // EV should contain all participants, check resources one by one + assignment 
= getEV(); + for (String resource : _allDBs) { + Assert.assertTrue(getParticipantsInEv(assignment.get(resource)).containsAll(_participantNames)); + validateAssignmentInEv(assignment.get(resource)); + } + } + + @Test(dependsOnMethods = "testAddingNodeWithEvacuationTag") public void testEvacuateAndCancelBeforeBootstrapFinish() throws Exception { // add a resource where downward state transition is slow createResourceWithDelayedRebalance(CLUSTER_NAME, "TEST_DB3_DELAYED_CRUSHED", "MasterSlave", PARTITIONS, REPLICA, @@ -151,7 +200,7 @@ public void testEvacuateAndCancelBeforeBootstrapFinish() throws Exception { Assert.assertTrue(_clusterVerifier.verifyByPolling()); // set bootstrap ST delay to a large number - _stateModelDelay = -300000L; + _stateModelDelay = -10000L; // evacuate an instance String instanceToEvacuate = _participants.get(0).getInstanceName(); _gSetupTool.getClusterManagementTool() @@ -174,9 +223,9 @@ public void testEvacuateAndCancelBeforeBootstrapFinish() throws Exception { validateAssignmentInEv(assignment.get(resource)); } - // cancel the evacuation by setting instance operation back to `ENABLE` + // cancel the evacuation _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, InstanceConstants.InstanceOperation.ENABLE); + .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, null); assignment = getEV(); for (String resource : _allDBs) { @@ -200,7 +249,7 @@ public void testEvacuateAndCancelBeforeBootstrapFinish() throws Exception { public void testEvacuateAndCancelBeforeDropFinish() throws Exception { // set DROP ST delay to a large number - _stateModelDelay = 300000L; + _stateModelDelay = 10000L; // evacuate an instance String instanceToEvacuate = _participants.get(0).getInstanceName(); @@ -211,8 +260,9 @@ public void testEvacuateAndCancelBeforeDropFinish() throws Exception { TestHelper.verify( () -> ((_dataAccessor.getChildNames(_dataAccessor.keyBuilder().messages(instanceToEvacuate))).isEmpty()), 30000); + 
// cancel evacuation _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, InstanceConstants.InstanceOperation.ENABLE); + .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, null); // check every replica has >= 3 active replicas, even before cluster converge Map assignment = getEV(); for (String resource : _allDBs) { @@ -290,6 +340,10 @@ private void addParticipant(String participantName) { } private void createTestDBs(long delayTime) throws InterruptedException { + createResourceWithDelayedRebalance(CLUSTER_NAME, "TEST_DB0_CRUSHED", + BuiltInStateModelDefinitions.LeaderStandby.name(), PARTITIONS, REPLICA, REPLICA - 1, -1, + CrushEdRebalanceStrategy.class.getName()); + _allDBs.add("TEST_DB0_CRUSHED"); createResourceWithDelayedRebalance(CLUSTER_NAME, "TEST_DB1_CRUSHED", BuiltInStateModelDefinitions.LeaderStandby.name(), PARTITIONS, REPLICA, REPLICA - 1, 200, CrushEdRebalanceStrategy.class.getName()); @@ -375,7 +429,7 @@ public StDelayMSStateModel() { private void sleepWhileNotCanceled(long sleepTime) throws InterruptedException{ while(sleepTime >0 && !isCancelled()) { Thread.sleep(5000); - sleepTime =- 5000; + sleepTime = sleepTime - 5000; } if (isCancelled()) { _cancelled = false; diff --git a/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java b/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java index d721c372a7..e4c058b09c 100644 --- a/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java +++ b/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java @@ -496,28 +496,10 @@ public void updateInstance() throws IOException { new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=INVALIDOP") .expectedReturnStatusCode(Response.Status.NOT_FOUND.getStatusCode()).format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); new 
JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation") - .expectedReturnStatusCode(Response.Status.BAD_REQUEST.getStatusCode()).format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); - // TODO: enable the following test when we add sanity check. - // set operation to be DISABLE - new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=DISABLE") .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, INSTANCE_NAME); Assert.assertEquals( - instanceConfig.getInstanceOperation(), InstanceConstants.InstanceOperation.DISABLE.toString()); - Assert.assertTrue(!instanceConfig.getInstanceEnabled()); - - // set operation to EVACUATE, expect error - new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=EVACUATE") - .expectedReturnStatusCode(Response.Status.BAD_REQUEST.getStatusCode()) - .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); - // set back to enable - new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=ENABLE") - .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); - instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, INSTANCE_NAME); - Assert.assertEquals( - instanceConfig.getInstanceOperation(), InstanceConstants.InstanceOperation.ENABLE.toString()); - Assert.assertTrue(instanceConfig.getInstanceEnabled()); - + instanceConfig.getInstanceOperation(), ""); System.out.println("End test :" + TestHelper.getTestMethodName()); }