[ARVADOS] updated: ba534230bcc23aaea69e5f256693b9e143200b63

Git user git at public.curoverse.com
Fri Apr 8 17:25:37 EDT 2016


Summary of changes:
 .../arvnodeman/computenode/dispatch/slurm.py       |  7 ++--
 services/nodemanager/arvnodeman/daemon.py          | 13 ++++++--
 .../nodemanager/tests/test_computenode_dispatch.py |  4 +--
 .../tests/test_computenode_dispatch_slurm.py       | 37 ++++++----------------
 4 files changed, 27 insertions(+), 34 deletions(-)

       via  ba534230bcc23aaea69e5f256693b9e143200b63 (commit)
      from  e13c868ed9216b4ad414adc435a9f9ed5afe2b89 (commit)

The revisions listed above that are new to this repository have not
appeared in any previous notification email, so they are listed in
full below.


commit ba534230bcc23aaea69e5f256693b9e143200b63
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Fri Apr 8 17:25:25 2016 -0400

    8799: shutdown_eligible() returns "node is draining" when in drain state.  Add comments about iterating over cloud_nodes to check for "down" nodes.  Fix tests.
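
For context on the first change below: shutdown_eligible() returns either the
literal True or a human-readable reason string (such as "node is draining"),
so callers must compare with `is True` rather than rely on truthiness. A
minimal sketch of the contract; the function name here is hypothetical, not
from this commit:

    def describe_decision(eligible):
        # True means "go ahead and shut down"; any string (e.g. "node is
        # draining") is a reason to hold off for now.
        if eligible is True:
            return "shutdown"
        return "wait: %s" % (eligible,)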

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 845379f..41919db 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -76,8 +76,11 @@ class ComputeNodeMonitorActor(SlurmMixin, MonitorActorBase):
             state = self._get_slurm_state(self.arvados_node['hostname'])
             # Automatically eligible for shutdown if it's down or failed, but
             # not drain to avoid a race condition with resume_node().
-            if state in self.SLURM_END_STATES and state not in self.SLURM_DRAIN_STATES:
-                return True
+            if state in self.SLURM_END_STATES:
+                if state in self.SLURM_DRAIN_STATES:
+                    return "node is draining"
+                else:
+                    return True
         return super(ComputeNodeMonitorActor, self).shutdown_eligible()
 
     def resume_node(self):
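
A standalone sketch of the branch added above. The two state sets are defined
on SlurmMixin; their exact members below are assumptions for illustration
only:

    SLURM_END_STATES = frozenset(['down', 'down*', 'drain', 'drng', 'fail'])
    SLURM_DRAIN_STATES = frozenset(['drain', 'drng'])

    def classify(state):
        # Mirrors the new shutdown_eligible() logic: a draining node yields a
        # reason string instead of True, avoiding the race with resume_node().
        if state in SLURM_END_STATES:
            if state in SLURM_DRAIN_STATES:
                return "node is draining"
            return True
        return None  # caller falls through to the superclass check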
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 2c2ebe8..a02da23 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -221,9 +221,13 @@ class NodeManagerDaemonActor(actor_class):
                 if cloud_rec.actor.offer_arvados_pair(arv_node).get():
                     self._pair_nodes(cloud_rec, arv_node)
                     break
-        for rec in self.arvados_nodes.nodes.itervalues():
-            if (rec.arvados_node["info"].get("slurm_state") == "down" and
-                rec.cloud_node is not None and
+        for rec in self.cloud_nodes.nodes.itervalues():
+            # crunch-dispatch turns all slurm states that are not either "idle"
+            # or "alloc" into "down", but in case that behavior changes, assume
+            # any state that is not "idle" or "alloc" could be a state we want
+            # to try to resume from.
+            if (rec.arvados_node is not None and
+                rec.arvados_node["info"].get("slurm_state") not in ("idle", "alloc") and
                 rec.cloud_node.id not in self.shutdowns):
                 self._resume_node(rec)
 
@@ -247,6 +251,9 @@ class NodeManagerDaemonActor(actor_class):
                   if size is None or c.cloud_node.size.id == size.id)
 
     def _nodes_down(self, size):
+        # Make sure to iterate over self.cloud_nodes because what we're
+        # counting here are compute nodes that are reported by the cloud
+        # provider but are considered "down" by Arvados.
         return sum(1 for down in
                    pykka.get_all(rec.actor.in_state('down') for rec in
                                  self.cloud_nodes.nodes.itervalues()
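
The daemon changes operate on record objects that pair an Arvados node with a
cloud node. A condensed sketch of the new resume filter, with a hypothetical
record shape and sample data (the real records expose rec.cloud_node.id):

    class Rec(object):
        def __init__(self, arvados_node, cloud_node_id):
            self.arvados_node = arvados_node
            self.cloud_node_id = cloud_node_id

    def want_resume(rec, shutdowns):
        # Resume anything SLURM does not report as healthy ("idle"/"alloc"),
        # unless that cloud node already has a shutdown in progress.
        return (rec.arvados_node is not None and
                rec.arvados_node["info"].get("slurm_state") not in ("idle", "alloc") and
                rec.cloud_node_id not in shutdowns)

    assert want_resume(Rec({"info": {"slurm_state": "down"}}, "n1"), set())
    assert not want_resume(Rec({"info": {"slurm_state": "idle"}}, "n1"), set())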
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index 6613718..14cd85e 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -361,7 +361,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
     def test_shutdown_without_arvados_node(self):
         self.make_actor(start_time=0)
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_no_shutdown_missing(self):
         arv_node = testutil.arvados_node_mock(10, job_uuid=None,
@@ -386,7 +386,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(11, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_no_shutdown_when_window_closed(self):
         self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
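
The assertTrue-to-assertIs changes above are not cosmetic: shutdown_eligible()
can return truthy reason strings, which assertTrue would accept silently. A
minimal demonstration of the difference, independent of the test suite:

    import unittest

    class SentinelDemo(unittest.TestCase):
        def test_true_vs_truthy(self):
            self.assertTrue("node is draining")          # passes: any truthy value
            self.assertIs(True, True)                    # passes: exact sentinel
            with self.assertRaises(AssertionError):
                self.assertIs(True, "node is draining")  # a string is not True

    if __name__ == '__main__':
        unittest.main()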
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index 6e03a7d..135b817 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -119,12 +119,8 @@ class SLURMComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(arv_node=arv_node)
         check_output.return_value = "drain\n"
         self.node_actor.resume_node().get(self.TIMEOUT)
-        check_output.assert_has_calls([mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]),
-                                       mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]),
-                                       mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]),
-                                       mock.call(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=RESUME'])],
-                                      any_order=True)
-        self.assertEqual(4, check_output.call_count)
+        self.assertIn(mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]), check_output.call_args_list)
+        self.assertIn(mock.call(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=RESUME']), check_output.call_args_list)
 
     @mock.patch("subprocess.check_output")
     def test_no_resume_idle_node(self, check_output):
@@ -132,39 +128,26 @@ class SLURMComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(arv_node=arv_node)
         check_output.return_value = "idle\n"
         self.node_actor.resume_node().get(self.TIMEOUT)
-        check_output.assert_has_calls([mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]),
-                                       mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]),
-                                       mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']])],
-                                      any_order=True)
-        self.assertEqual(3, check_output.call_count)
-
+        self.assertIn(mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]), check_output.call_args_list)
+        self.assertNotIn(mock.call(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=RESUME']), check_output.call_args_list)
 
     @mock.patch("subprocess.check_output")
     def test_resume_node_exception(self, check_output):
         arv_node = testutil.arvados_node_mock()
         self.make_actor(arv_node=arv_node)
         check_output.side_effect = Exception()
-        check_output.return_value = "drain\n"
         self.node_actor.resume_node().get(self.TIMEOUT)
-        check_output.assert_has_calls([mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]),
-                                       mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]),
-                                       mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']])],
-                                      any_order=True)
-        self.assertEqual(3, check_output.call_count)
-
+        self.assertIn(mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]), check_output.call_args_list)
+        self.assertNotIn(mock.call(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=RESUME']), check_output.call_args_list)
 
     @mock.patch("subprocess.check_output")
     def test_shutdown_down_node(self, check_output):
         check_output.return_value = "down\n"
-        self.make_actor()
-        self.assertEquals('shutdown window is not open.', self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-        self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.make_actor(arv_node=testutil.arvados_node_mock())
+        self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     @mock.patch("subprocess.check_output")
     def test_no_shutdown_drain_node(self, check_output):
         check_output.return_value = "drain\n"
-        self.make_actor()
-        self.assertEquals('shutdown window is not open.', self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-        self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.make_actor(arv_node=testutil.arvados_node_mock())
+        self.assertEquals('node is draining', self.node_actor.shutdown_eligible().get(self.TIMEOUT))
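
The rewritten assertions above check membership in call_args_list rather than
pinning an exact call sequence and count, so the tests keep passing if the
monitor polls sinfo a different number of times. The pattern in isolation,
with an illustrative hostname and the same mock package these tests use:

    import mock

    fake_check_output = mock.Mock(return_value="drain\n")
    fake_check_output(['sinfo', '--noheader', '-o', '%t', '-n', 'compute0'])
    fake_check_output(['sinfo', '--noheader', '-o', '%t', '-n', 'compute0'])

    # Present at least once, regardless of how many times it was called:
    assert mock.call(['sinfo', '--noheader', '-o', '%t', '-n',
                      'compute0']) in fake_check_output.call_args_list
    # Never called:
    assert mock.call(['scontrol', 'update', 'NodeName=compute0',
                      'State=RESUME']) not in fake_check_output.call_args_list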

-----------------------------------------------------------------------


hooks/post-receive
-- 
More information about the arvados-commits mailing list