Make the list of excluded states complete (#645)

JIRA issue: DCOS_OSS-5370
d2iq-archive · Jul 25, 2019 · 755efde · 755efde
1 parent 87c281b
commit 755efde
Show file tree

Hide file tree

Showing 2 changed files with 58 additions and 12 deletions.
diff --git a/ci/ci-integration.sh b/ci/ci-integration.sh
@@ -41,7 +41,7 @@ random_string() {
 
 cluster_ids() {
     # dcos cluster command was added into the CLI starting in DC/OS 1.10
-    if [ "${DCOS_VERSION}" != '1.9' ]; then 
+    if [ "${DCOS_VERSION}" != '1.9' ]; then
         if ! dcos cluster list >/dev/null 2>&1; then
             return
         fi
@@ -133,7 +133,7 @@ template_parameters:
     PublicSlaveInstanceCount: 1
     SlaveInstanceCount: 1
 EOF
-    
+
     # DefaultInstanceType parameter has not been backported into 1.9 CF templates.
     # The templates actually hardcode in this parameter instead.
     if [ "${DCOS_VERSION}" != '1.9' ]; then
@@ -149,7 +149,7 @@ EOF
     time wrapped_dcos_launch create
     time wrapped_dcos_launch wait
     wrapped_dcos_launch describe
-    
+
     if [ "${SECURITY_MODE}" == 'disabled' ] || [ "${VARIANT}" == 'open' ]; then
         CLUSTER_URL=http://$(wrapped_dcos_launch describe | jq -r .masters[0].public_ip)
     else
@@ -269,7 +269,7 @@ launch_marathonlb() {
 
     MLB_OPTIONS_JSON="mlb-options.json"
     RENDER_RESPONSE_JSON="response.json"
-    MLB_JSON="mlb.json"    
+    MLB_JSON="mlb.json"
 
     # Strict mode requires having a service account.
     if [ "${SECURITY_MODE}" == 'strict' ]; then
@@ -314,11 +314,11 @@ EOF
     else
         dcos package describe marathon-lb --app --render > $RENDER_RESPONSE_JSON
     fi
-    
+
     cat $RENDER_RESPONSE_JSON | jq --arg DOCKER_IMAGE "$DOCKER_IMAGE" '.container.docker.image=$DOCKER_IMAGE' | jq --arg MLB_VERSION "$MLB_VERSION" '.labels.DCOS_PACKAGE_VERSION=$MLB_VERSION' > $MLB_JSON
     cat $MLB_JSON
 
-    echo "Launching Marathon-lb." 
+    echo "Launching Marathon-lb."
     dcos marathon app add $MLB_JSON
 
     # Sleeping to wait for MLB to deploy.
@@ -348,7 +348,7 @@ check_marathonlb_health() {
         status_line "Marathon-lb stdout:"
         dcos task log marathon-lb stdout --lines=50
 
-        status_line "Marathon-lb stderr:"        
+        status_line "Marathon-lb stderr:"
         dcos task log marathon-lb stderr --lines=50
 
         delete_cluster
@@ -388,4 +388,4 @@ launch_marathonlb
 run_integration_tests
 
 # Deleting cluster through dcos-launch.
-delete_cluster
+delete_cluster
diff --git a/marathon_lb.py b/marathon_lb.py
@@ -1565,15 +1565,61 @@ def get_health_check(app, portIndex):
 healthCheckResultCache = LRUCache()
 
 
+# Last time this variable was adjusted, the task states were extracted from
+# https://github.com/apache/mesos/blob/e58f4b97b5d13ccc18ad9b1632d7e6409bdd0c55/include/mesos/mesos.proto
+#
+# Essentially, only there states, namely, TASK_RUNNING, TASK_LOST, and
+# TASK_UNREACHABLE, are used in HAProxy config generation. And here is why:
+#
+#   - TASK_RUNNING - No comments.
+#   - TASK_LOST - A task is lost as seen by Mesos due to, for example,
+#     a network partition between a Mesos master and the Mesos agent that is
+#     on the same node the task is running on. But the task itself may or
+#     may not be running. In fact, it might transition to TASK_RUNNING state
+#     when the Mesos agent reregisters with the Mesos master. Hence let's
+#     be optimistic here and route trafic to tasks in this state.
+#   - TASK_UNREACHABLE - The same line of reasoning as the one used for
+#     TASK_LOST state, is valid here too.
+EXCLUDED_TASK_STATES = {
+    # A task in this state transitions into another state quickly,
+    # unless there is a bug in Mesos which causes it hang in this state
+    # for quite some time or even forever. If the latter is not the case,
+    # it moves either to a terminal state (for instance, TASK_ERROR),
+    # or TASK_STARTING. In any case, there is no need to pay attention
+    # to it yet.
+    'TASK_STAGING',
+    # A similar reasoning applies here too. A task state becomes either
+    # terminal or `TASK_RUNNING` quickly.
+    'TASK_STARTING',
+    # A task is being killed. No need to route traffic to it.
+    'TASK_KILLING',
+    # A terminal state.
+    'TASK_FINISHED',
+    # A terminal state.
+    'TASK_FAILED',
+    # A terminal state.
+    'TASK_KILLED',
+    # A terminal state.
+    'TASK_ERROR',
+    # A terminal state.
+    'TASK_DROPPED',
+    # A terminal state.
+    'TASK_GONE',
+    # Though it is not a terminal state, let's trust the operator, and
+    # assume that the task is truly gone while it is in this state.
+    'TASK_GONE_BY_OPERATOR',
+    # A task is in some funny state according to Mesos. Let's not route
+    # traffic to it.
+    'TASK_UNKNOWN',
+}
+
+
 def get_apps(marathon, apps=[]):
     if len(apps) == 0:
         apps = marathon.list()
 
     logger.debug("got apps %s", [app["id"] for app in apps])
 
-    excluded_states = {'TASK_KILLING', 'TASK_KILLED',
-                       'TASK_FINISHED', 'TASK_ERROR'}
-
     marathon_apps = []
     # This process requires 2 passes: the first is to gather apps belonging
     # to a deployment group.
@@ -1749,7 +1795,7 @@ def get_apps(marathon, apps=[]):
 
             # 'state' will not be present in test cases.
             # Should always be present in an actual cluster
-            if 'state' in task and task['state'] in excluded_states:
+            if 'state' in task and task['state'] in EXCLUDED_TASK_STATES:
                 logger.warning("Ignoring non-running task " + task['id'] +
                                " with state " + task['state'])
                 continue