[ARVADOS] created: 04354c91038a80430ab80572f774486757a31a21

Git user git at public.curoverse.com
Wed Apr 6 11:22:46 EDT 2016


        at  04354c91038a80430ab80572f774486757a31a21 (commit)


commit 04354c91038a80430ab80572f774486757a31a21
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Apr 6 11:22:31 2016 -0400

    8799: Nodes with slurm_state of "drng" or "drain" that don't have active
    ShutdownActors are put back into idle state in daemon.update_arvados_nodes.

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 2ddfb0a..f2dbc3e 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -295,6 +295,8 @@ class ComputeNodeUpdateActor(config.actor_class):
     def sync_node(self, cloud_node, arvados_node):
         return self._cloud.sync_node(cloud_node, arvados_node)
 
+    def reenable_node(self, arvados_node):
+        """Do nothing by default; subclasses may override to re-enable the node."""
 
 class ComputeNodeMonitorActor(config.actor_class):
     """Actor to manage a running compute node.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 255e50a..fe811e0 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -8,6 +8,7 @@ import time
 from . import \
     ComputeNodeSetupActor, ComputeNodeUpdateActor, ComputeNodeMonitorActor
 from . import ComputeNodeShutdownActor as ShutdownActorBase
+from . import ComputeNodeUpdateActor as UpdateActorBase
 from .. import RetryMixin
 
 class ComputeNodeShutdownActor(ShutdownActorBase):
@@ -64,3 +65,14 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
         else:
             self._timer.schedule(time.time() + 10,
                                  self._later.await_slurm_drain)
+
+
+class ComputeNodeUpdateActor(UpdateActorBase):
+    def reenable_node(self, arv_node):
+        """Ask SLURM to set the node's state to idle; log any failure."""
+        try:
+            subprocess.check_call(['scontrol', 'update',
+                                   'NodeName=' + arv_node['hostname'],
+                                   'State=idle'])
+        except Exception as error:
+            self._logger.warning("Subprocess exception: %s", error, exc_info=error)
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 7976f21..96661e9 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -218,6 +218,11 @@ class NodeManagerDaemonActor(actor_class):
                 if cloud_rec.actor.offer_arvados_pair(arv_node).get():
                     self._pair_nodes(cloud_rec, arv_node)
                     break
+        for rec in self.arvados_nodes.nodes.itervalues():
+            if (rec.arvados_node["info"].get("slurm_state") in ("drng", "drain") and
+                rec.cloud_node is not None and
+                rec.cloud_node.id not in self.shutdowns):
+                self._cloud_updater.reenable_node(rec.arvados_node)
 
     def _nodes_booting(self, size):
         s = sum(1
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index 8648783..80d35ba 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -87,3 +87,25 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
         proc_mock.return_value = 'drain\n'
         super(SLURMComputeNodeShutdownActorTestCase,
               self).test_arvados_node_cleaned_after_shutdown()
+
+class SLURMComputeNodeUpdateActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
+    def make_actor(self):
+        self.driver = mock.MagicMock(name='driver_mock')
+        self.updater = slurm_dispatch.ComputeNodeUpdateActor.start(self.driver).proxy()
+
+    @mock.patch("subprocess.check_call")
+    def test_reenable_node(self, subproc):
+        """reenable_node runs `scontrol update ... State=idle` for the node."""
+        self.make_actor()
+        arv_node = testutil.arvados_node_mock()
+        self.updater.reenable_node(arv_node).get(self.TIMEOUT)
+        subproc.assert_called_with(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=idle'])
+
+    @mock.patch("subprocess.check_call")
+    def test_reenable_node_exception(self, subproc):
+        """An exception raised by check_call does not propagate to the caller."""
+        self.make_actor()
+        arv_node = testutil.arvados_node_mock()
+        subproc.side_effect = Exception()
+        self.updater.reenable_node(arv_node).get(self.TIMEOUT)
+        subproc.assert_called_with(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=idle'])
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 2daca08..5f34c3f 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -718,3 +718,23 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         # test for that.
         self.assertEqual(2, sizecounts[small.id])
         self.assertEqual(1, sizecounts[big.id])
+
+    def test_reenable_drained_nodes(self):
+        """With no shutdown registered, a "drain" node is passed to reenable_node."""
+        drained = testutil.arvados_node_mock(1, info={"ec2_instance_id": "1", "slurm_state":"drain"})
+        self.make_daemon([testutil.cloud_node_mock(1)], [drained])
+        self.stop_proxy(self.daemon)
+        self.cloud_updates.reenable_node.assert_called_with(drained)
+
+    def test_no_reenable_shutdown_nodes(self):
+        """A "drain" node with a shutdown registered for it is not re-enabled."""
+        cloud_node = testutil.cloud_node_mock(1)
+        arv_node = testutil.arvados_node_mock(1, info={"ec2_instance_id": "1", "slurm_state":"drain"})
+        self.make_daemon([cloud_node], [])
+        # Register a shutdown for the cloud node before its Arvados record arrives.
+        self.node_shutdown = mock.MagicMock(name='shutdown_mock')
+        self.daemon.shutdowns.get()[cloud_node.id] = self.node_shutdown
+        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
+        self.stop_proxy(self.daemon)
+        # Older mock releases lack assert_not_called(); there it passes silently.
+        self.assertFalse(self.cloud_updates.reenable_node.called)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list