[ARVADOS] created: 04354c91038a80430ab80572f774486757a31a21
Git user
git at public.curoverse.com
Wed Apr 6 11:22:46 EDT 2016
at 04354c91038a80430ab80572f774486757a31a21 (commit)
commit 04354c91038a80430ab80572f774486757a31a21
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Apr 6 11:22:31 2016 -0400
8799: Nodes with slurm_state of "drng" or "drain" that don't have active
ShutdownActors are put back into idle state in daemon.update_arvados_nodes.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 2ddfb0a..f2dbc3e 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -295,6 +295,8 @@ class ComputeNodeUpdateActor(config.actor_class):
def sync_node(self, cloud_node, arvados_node):
return self._cloud.sync_node(cloud_node, arvados_node)
+    def reenable_node(self, arvados_node):
+        """Default hook: do nothing.
+
+        The SLURM dispatch ComputeNodeUpdateActor overrides this to set the
+        node's SLURM state back to idle.
+        """
+        pass
class ComputeNodeMonitorActor(config.actor_class):
"""Actor to manage a running compute node.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 255e50a..fe811e0 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -8,6 +8,7 @@ import time
from . import \
ComputeNodeSetupActor, ComputeNodeUpdateActor, ComputeNodeMonitorActor
from . import ComputeNodeShutdownActor as ShutdownActorBase
+from . import ComputeNodeUpdateActor as UpdateActorBase
from .. import RetryMixin
class ComputeNodeShutdownActor(ShutdownActorBase):
@@ -64,3 +65,14 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
else:
self._timer.schedule(time.time() + 10,
self._later.await_slurm_drain)
+
+
+class ComputeNodeUpdateActor(UpdateActorBase):
+    """Update actor that can also put a SLURM node back into service."""
+
+    def reenable_node(self, arv_node):
+        """Run `scontrol update NodeName=<hostname> State=idle` for arv_node.
+
+        Any exception raised while building or running the command is
+        logged as a warning and not re-raised.
+        """
+        try:
+            node_spec = 'NodeName=' + arv_node['hostname']
+            scontrol_cmd = ['scontrol', 'update', node_spec, 'State=idle']
+            subprocess.check_call(scontrol_cmd)
+        except Exception as error:
+            self._logger.warn(
+                "Subprocess exception: %s", error, exc_info=error)
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 7976f21..96661e9 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -218,6 +218,11 @@ class NodeManagerDaemonActor(actor_class):
if cloud_rec.actor.offer_arvados_pair(arv_node).get():
self._pair_nodes(cloud_rec, arv_node)
break
+ for rec in self.arvados_nodes.nodes.itervalues():
+ if (rec.arvados_node["info"].get("slurm_state") in ("drng", "drain") and
+ rec.cloud_node is not None and
+ rec.cloud_node.id not in self.shutdowns):
+ self._cloud_updater.reenable_node(rec.arvados_node)
def _nodes_booting(self, size):
s = sum(1
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index 8648783..80d35ba 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -87,3 +87,25 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
proc_mock.return_value = 'drain\n'
super(SLURMComputeNodeShutdownActorTestCase,
self).test_arvados_node_cleaned_after_shutdown()
+
+class SLURMComputeNodeUpdateActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
+    # Exercises the SLURM ComputeNodeUpdateActor.reenable_node with
+    # subprocess.check_call patched out, so scontrol is never really run.
+    def make_actor(self):
+        # Start the update actor against a mock driver and keep its proxy.
+        self.driver = mock.MagicMock(name='driver_mock')
+        self.updater = slurm_dispatch.ComputeNodeUpdateActor.start(self.driver).proxy()
+
+    @mock.patch("subprocess.check_call")
+    def test_reenable_node(self, subproc):
+        # reenable_node should invoke scontrol with the node's hostname.
+        self.make_actor()
+        arv_node = testutil.arvados_node_mock()
+        self.updater.reenable_node(arv_node).get(self.TIMEOUT)
+        subproc.assert_called_with(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=idle'])
+
+    @mock.patch("subprocess.check_call")
+    def test_reenable_node_exception(self, subproc):
+        # A failing scontrol call must not propagate out of the actor call;
+        # .get() below would re-raise it if it did.
+        self.make_actor()
+        arv_node = testutil.arvados_node_mock()
+        subproc.side_effect = Exception()
+        self.updater.reenable_node(arv_node).get(self.TIMEOUT)
+        subproc.assert_called_with(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=idle'])
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 2daca08..5f34c3f 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -718,3 +718,23 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
# test for that.
self.assertEqual(2, sizecounts[small.id])
self.assertEqual(1, sizecounts[big.id])
+
+    def test_reenable_drained_nodes(self):
+        # An Arvados node whose slurm_state is "drain", with no shutdown
+        # registered by this test, is expected to be passed to reenable_node.
+        drained_info = {"ec2_instance_id": "1", "slurm_state":"drain"}
+        cloud = testutil.cloud_node_mock(1)
+        arv = testutil.arvados_node_mock(1, info=drained_info)
+        self.make_daemon([cloud], [arv])
+        self.stop_proxy(self.daemon)
+        self.cloud_updates.reenable_node.assert_called_with(arv)
+
+    def test_no_reenable_shutdown_nodes(self):
+        # A drained node whose cloud node has an entry in the daemon's
+        # shutdowns must not be re-enabled.
+        cloud_node = testutil.cloud_node_mock(1)
+        arv_node = testutil.arvados_node_mock(1, info={"ec2_instance_id": "1", "slurm_state":"drain"})
+
+        self.make_daemon([cloud_node], [])
+
+        self.node_shutdown = mock.MagicMock(name='shutdown_mock')
+        self.daemon.shutdowns.get()[cloud_node.id] = self.node_shutdown
+
+        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
+        self.stop_proxy(self.daemon)
+        # Check .called directly: on mock releases that predate
+        # assert_not_called(), that name is auto-created as a child mock and
+        # calling it can never fail, which would make this test vacuous.
+        self.assertFalse(self.cloud_updates.reenable_node.called)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list