[ARVADOS] updated: ba534230bcc23aaea69e5f256693b9e143200b63
Git user <git at public.curoverse.com>
Fri Apr 8 17:25:37 EDT 2016
Summary of changes:
.../arvnodeman/computenode/dispatch/slurm.py | 7 ++--
services/nodemanager/arvnodeman/daemon.py | 13 ++++++--
.../nodemanager/tests/test_computenode_dispatch.py | 4 +--
.../tests/test_computenode_dispatch_slurm.py | 37 ++++++----------------
4 files changed, 27 insertions(+), 34 deletions(-)
via ba534230bcc23aaea69e5f256693b9e143200b63 (commit)
from e13c868ed9216b4ad414adc435a9f9ed5afe2b89 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit ba534230bcc23aaea69e5f256693b9e143200b63
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri Apr 8 17:25:25 2016 -0400
8799: shutdown_eligible() returns "node is draining" when in drain state. Add comments about iterating over cloud_nodes to check for "down" nodes. Fix tests.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 845379f..41919db 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -76,8 +76,11 @@ class ComputeNodeMonitorActor(SlurmMixin, MonitorActorBase):
         state = self._get_slurm_state(self.arvados_node['hostname'])
         # Automatically eligible for shutdown if it's down or failed, but
         # not drain to avoid a race condition with resume_node().
-        if state in self.SLURM_END_STATES and state not in self.SLURM_DRAIN_STATES:
-            return True
+        if state in self.SLURM_END_STATES:
+            if state in self.SLURM_DRAIN_STATES:
+                return "node is draining"
+            else:
+                return True
         return super(ComputeNodeMonitorActor, self).shutdown_eligible()
 
     def resume_node(self):
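With this change, shutdown_eligible() returns the exact value True when a node may be shut down and a human-readable reason string (for example "node is draining") otherwise. A minimal sketch of the calling convention follows; it is not Arvados code, and the describe_eligibility() helper is made up for illustration:

    # Minimal sketch, not Arvados code: shutdown_eligible() returns either the
    # exact value True or a reason string such as "node is draining".
    def describe_eligibility(result):
        """Return a log-friendly description of a shutdown_eligible() result."""
        if result is True:   # identity check; a reason string is also truthy
            return "eligible for shutdown"
        return "not eligible: %s" % (result,)

    print(describe_eligibility(True))                # eligible for shutdown
    print(describe_eligibility("node is draining"))  # not eligible: node is draining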
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 2c2ebe8..a02da23 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -221,9 +221,13 @@ class NodeManagerDaemonActor(actor_class):
                 if cloud_rec.actor.offer_arvados_pair(arv_node).get():
                     self._pair_nodes(cloud_rec, arv_node)
                     break
-        for rec in self.arvados_nodes.nodes.itervalues():
-            if (rec.arvados_node["info"].get("slurm_state") == "down" and
-                rec.cloud_node is not None and
+        for rec in self.cloud_nodes.nodes.itervalues():
+            # crunch-dispatch turns all slurm states that are not either "idle"
+            # or "alloc" into "down", but in case that behavior changes, assume
+            # any state that is not "idle" or "alloc" could be a state we want
+            # to try to resume from.
+            if (rec.arvados_node is not None and
+                rec.arvados_node["info"].get("slurm_state") not in ("idle", "alloc") and
                 rec.cloud_node.id not in self.shutdowns):
                 self._resume_node(rec)
 
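The loop above now walks self.cloud_nodes and treats any paired node whose Arvados slurm_state is neither "idle" nor "alloc" as a candidate for resume, as long as it is not already being shut down. A minimal sketch of that predicate, using plain dicts instead of the daemon's node records (the 'arvados_node' and 'cloud_node_id' keys are assumed for illustration only):

    # Sketch only: the resume-eligibility check applied to each cloud node
    # record. The dict layout stands in for the real record attributes.
    def should_resume(rec, shutdowns):
        arv = rec.get('arvados_node')
        if arv is None:                     # unpaired cloud node: nothing to resume
            return False
        state = arv.get('info', {}).get('slurm_state')
        return (state not in ('idle', 'alloc') and
                rec['cloud_node_id'] not in shutdowns)

    print(should_resume({'arvados_node': {'info': {'slurm_state': 'down'}},
                         'cloud_node_id': 'cn1'}, shutdowns=set()))   # True
    print(should_resume({'arvados_node': {'info': {'slurm_state': 'idle'}},
                         'cloud_node_id': 'cn1'}, shutdowns=set()))   # False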
@@ -247,6 +251,9 @@
                  if size is None or c.cloud_node.size.id == size.id)
 
     def _nodes_down(self, size):
+        # Make sure to iterate over self.cloud_nodes because what we're
+        # counting here are compute nodes that are reported by the cloud
+        # provider but are considered "down" by Arvados.
         return sum(1 for down in
                    pykka.get_all(rec.actor.in_state('down') for rec in
                                  self.cloud_nodes.nodes.itervalues()
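The new comment spells out that _nodes_down() counts nodes the cloud provider still reports but that Arvados considers "down". A simplified sketch of that count over plain records (the daemon's real code goes through pykka actor proxies; the dict layout here is invented for the example):

    # Sketch only: count cloud-reported nodes whose monitor says they are
    # 'down', optionally restricted to one node size.
    def count_down(cloud_records, size_id=None):
        return sum(1 for rec in cloud_records
                   if rec['state'] == 'down'
                   and (size_id is None or rec['size_id'] == size_id))

    records = [{'state': 'down', 'size_id': 'small'},
               {'state': 'busy', 'size_id': 'small'}]
    print(count_down(records))            # 1
    print(count_down(records, 'large'))   # 0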
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index 6613718..14cd85e 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -361,7 +361,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
     def test_shutdown_without_arvados_node(self):
         self.make_actor(start_time=0)
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_no_shutdown_missing(self):
         arv_node = testutil.arvados_node_mock(10, job_uuid=None,
@@ -386,7 +386,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(11, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_no_shutdown_when_window_closed(self):
         self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
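Switching these assertions from assertTrue to assertIs(True, ...) matters because shutdown_eligible() can now return a non-empty reason string, which assertTrue would also accept. A small self-contained illustration (not part of the Arvados test suite):

    # Illustration only: a reason string is truthy, so assertTrue cannot tell
    # "eligible" apart from "not eligible, and here is why".
    import unittest

    class AssertIsVsAssertTrue(unittest.TestCase):
        def test_reason_string_is_truthy(self):
            self.assertTrue("node is draining")          # passes, proves little
            self.assertIsNot(True, "node is draining")   # the stricter identity check

    if __name__ == "__main__":
        unittest.main()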
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index 6e03a7d..135b817 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -119,12 +119,8 @@ class SLURMComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(arv_node=arv_node)
         check_output.return_value = "drain\n"
         self.node_actor.resume_node().get(self.TIMEOUT)
-        check_output.assert_has_calls([mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]),
-                                       mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]),
-                                       mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]),
-                                       mock.call(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=RESUME'])],
-                                      any_order=True)
-        self.assertEqual(4, check_output.call_count)
+        self.assertIn(mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]), check_output.call_args_list)
+        self.assertIn(mock.call(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=RESUME']), check_output.call_args_list)
 
     @mock.patch("subprocess.check_output")
     def test_no_resume_idle_node(self, check_output):
@@ -132,39 +128,26 @@ class SLURMComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(arv_node=arv_node)
         check_output.return_value = "idle\n"
         self.node_actor.resume_node().get(self.TIMEOUT)
-        check_output.assert_has_calls([mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]),
-                                       mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]),
-                                       mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']])],
-                                      any_order=True)
-        self.assertEqual(3, check_output.call_count)
-
+        self.assertIn(mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]), check_output.call_args_list)
+        self.assertNotIn(mock.call(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=RESUME']), check_output.call_args_list)
 
     @mock.patch("subprocess.check_output")
     def test_resume_node_exception(self, check_output):
         arv_node = testutil.arvados_node_mock()
         self.make_actor(arv_node=arv_node)
         check_output.side_effect = Exception()
-        check_output.return_value = "drain\n"
         self.node_actor.resume_node().get(self.TIMEOUT)
-        check_output.assert_has_calls([mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]),
-                                       mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]),
-                                       mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']])],
-                                      any_order=True)
-        self.assertEqual(3, check_output.call_count)
-
+        self.assertIn(mock.call(['sinfo', '--noheader', '-o', '%t', '-n', arv_node['hostname']]), check_output.call_args_list)
+        self.assertNotIn(mock.call(['scontrol', 'update', 'NodeName=' + arv_node['hostname'], 'State=RESUME']), check_output.call_args_list)
 
     @mock.patch("subprocess.check_output")
     def test_shutdown_down_node(self, check_output):
         check_output.return_value = "down\n"
-        self.make_actor()
-        self.assertEquals('shutdown window is not open.', self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-        self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.make_actor(arv_node=testutil.arvados_node_mock())
+        self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     @mock.patch("subprocess.check_output")
     def test_no_shutdown_drain_node(self, check_output):
         check_output.return_value = "drain\n"
-        self.make_actor()
-        self.assertEquals('shutdown window is not open.', self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-        self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.make_actor(arv_node=testutil.arvados_node_mock())
+        self.assertEquals('node is draining', self.node_actor.shutdown_eligible().get(self.TIMEOUT))
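The slurm dispatch tests now assert membership in check_output.call_args_list instead of pinning an exact call count, so they keep passing no matter how many times the node state is polled. A standalone example of that pattern (the command lists are placeholders, not the real sinfo/scontrol invocations):

    # Example of the assertion style the tests moved to: check that specific
    # calls were (or were not) made, without asserting an exact call count.
    import mock   # on Python 3, use: from unittest import mock

    run = mock.Mock()
    run(['sinfo', '--noheader'])
    run(['sinfo', '--noheader'])      # polled twice; the exact count is irrelevant
    assert mock.call(['sinfo', '--noheader']) in run.call_args_list
    assert mock.call(['scontrol', 'update']) not in run.call_args_list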
-----------------------------------------------------------------------
hooks/post-receive
--