[ARVADOS] created: 1.1.1-228-gba6e96d

Git user git at public.curoverse.com
Tue Dec 12 13:35:46 EST 2017


        at  ba6e96dfc00cc6ce4f6ab299bdbff616a523ce90 (commit)


commit ba6e96dfc00cc6ce4f6ab299bdbff616a523ce90
Author: Peter Amstutz <pamstutz at veritasgenetics.com>
Date:   Tue Dec 12 13:35:21 2017 -0500

    12614: Make node manager aware of "fail" node state.
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz at veritasgenetics.com>

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/transitions.py b/services/nodemanager/arvnodeman/computenode/dispatch/transitions.py
index 3a398a5..93f50c1 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/transitions.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/transitions.py
@@ -53,4 +53,17 @@ transitions = {
  ('unpaired', 'open', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
  ('unpaired', 'open', 'boot wait', 'idle exceeded'): None,
  ('unpaired', 'open', 'boot wait', 'idle wait'): None,
- ('unpaired', 'open', 'boot wait', 'not idle'): None}
+ ('unpaired', 'open', 'boot wait', 'not idle'): None,
+
+ ('fail', 'closed', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('fail', 'closed', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('fail', 'closed', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('fail', 'closed', 'boot wait', 'idle exceeded'): "START_SHUTDOWN",
+ ('fail', 'closed', 'boot wait', 'idle wait'): "START_SHUTDOWN",
+ ('fail', 'closed', 'boot wait', 'not idle'): "START_SHUTDOWN",
+ ('fail', 'open', 'boot exceeded', 'idle exceeded'): "START_SHUTDOWN",
+ ('fail', 'open', 'boot exceeded', 'idle wait'): "START_SHUTDOWN",
+ ('fail', 'open', 'boot exceeded', 'not idle'): "START_SHUTDOWN",
+ ('fail', 'open', 'boot wait', 'idle exceeded'): "START_SHUTDOWN",
+ ('fail', 'open', 'boot wait', 'idle wait'): "START_SHUTDOWN",
+ ('fail', 'open', 'boot wait', 'not idle'): "START_SHUTDOWN"}
diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py
index e06ec83..70ad54d 100644
--- a/services/nodemanager/arvnodeman/nodelist.py
+++ b/services/nodemanager/arvnodeman/nodelist.py
@@ -39,8 +39,8 @@ class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
                              'mix',   'mix*',
                              'drng',  'drng*'):
                     nodestates[nodename] = 'busy'
-                elif state == 'idle':
-                    nodestates[nodename] = 'idle'
+                elif state in ('idle', 'fail'):
+                    nodestates[nodename] = state
                 else:
                     nodestates[nodename] = 'down'
             except ValueError:
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index e4037d1..4b35205 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -444,6 +444,13 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.assertEquals((True, "node state is ('idle', 'open', 'boot wait', 'idle exceeded')"),
                           self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
+    def test_shutdown_when_node_state_fail(self):
+        self.make_actor(5, testutil.arvados_node_mock(
+            5, crunch_worker_state='fail'))
+        self.shutdowns._set_state(True, 600)
+        self.assertEquals((True, "node state is ('fail', 'open', 'boot wait', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+
     def test_no_shutdown_when_node_state_stale(self):
         self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
         self.shutdowns._set_state(True, 600)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list