[ARVADOS] updated: abfdd429351ad6b457667b624c8ba550ffcac999
Git user
git at public.curoverse.com
Fri Apr 15 10:26:51 EDT 2016
Summary of changes:
.../arvnodeman/computenode/dispatch/__init__.py | 23 +++++---
.../arvnodeman/computenode/dispatch/slurm.py | 13 +++--
services/nodemanager/arvnodeman/daemon.py | 12 ----
.../nodemanager/tests/test_computenode_dispatch.py | 32 +++++-----
.../tests/test_computenode_dispatch_slurm.py | 68 ++++++----------------
services/nodemanager/tests/test_daemon.py | 22 -------
6 files changed, 54 insertions(+), 116 deletions(-)
via abfdd429351ad6b457667b624c8ba550ffcac999 (commit)
via 2a13fcc84b240e368787c8e94ced95d75eee0cc4 (commit)
from 26751323e77005dc158b64e86c47bbb9459e6697 (commit)
The revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit abfdd429351ad6b457667b624c8ba550ffcac999
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri Apr 15 10:26:46 2016 -0400
Fixup test_node_undrained_when_shutdown_cancelled and test_alloc_node_undrained_when_shutdown_cancelled.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 33f9de6..9bb7bd0 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -395,7 +395,6 @@ class ComputeNodeMonitorActor(config.actor_class):
try:
eligible = self.shutdown_eligible()
next_opening = self._shutdowns.next_opening()
- print(self.last_shutdown_opening, next_opening)
if eligible is True:
self._debug("Suggesting shutdown")
_notify_subscribers(self.actor_ref.proxy(), self.subscribers)
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index 6358623..0f98afc 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -60,19 +60,21 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
self.check_success_flag(True)
self.assertFalse(proc_mock.called)
- # def test_node_undrained_when_shutdown_window_closes(self, proc_mock):
- # proc_mock.side_effect = iter(['drng\n', 'idle\n'])
- # self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
- # self.make_actor()
- # self.check_success_flag(False, 2)
- # self.check_slurm_got_args(proc_mock, 'NodeName=compute99', 'State=RESUME')
-
- # def test_alloc_node_undrained_when_shutdown_window_closes(self, proc_mock):
- # proc_mock.side_effect = iter(['alloc\n'])
- # self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
- # self.make_actor()
- # self.check_success_flag(False, 2)
- # self.check_slurm_got_args(proc_mock, 'sinfo', '--noheader', '-o', '%t', '-n', 'compute99')
+ def test_node_undrained_when_shutdown_cancelled(self, proc_mock):
+ proc_mock.side_effect = iter(['drng\n', 'idle\n'])
+ self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
+ self.make_actor()
+ self.shutdown_actor.cancel_shutdown("test")
+ self.check_success_flag(False, 2)
+ self.check_slurm_got_args(proc_mock, 'NodeName=compute99', 'State=RESUME')
+
+ def test_alloc_node_undrained_when_shutdown_cancelled(self, proc_mock):
+ proc_mock.side_effect = iter(['alloc\n'])
+ self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
+ self.make_actor()
+ self.shutdown_actor.cancel_shutdown("test")
+ self.check_success_flag(False, 2)
+ self.check_slurm_got_args(proc_mock, 'sinfo', '--noheader', '-o', '%t', '-n', 'compute99')
def test_cancel_shutdown_retry(self, proc_mock):
proc_mock.side_effect = iter([OSError, 'drain\n', OSError, 'idle\n', 'idle\n'])
commit 2a13fcc84b240e368787c8e94ced95d75eee0cc4
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri Apr 15 08:10:13 2016 -0400
8953: Tests pass, with some removed due to removal of the corresponding behavior.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index a2d24b3..33f9de6 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -355,8 +355,7 @@ class ComputeNodeMonitorActor(config.actor_class):
result = result and not self.arvados_node['job_uuid']
return result
- def consider_shutdown(self):
- try:
+ def shutdown_eligible(self):
# Collect states and then consult state transition table
# whether we should shut down. Possible states are:
#
@@ -367,11 +366,12 @@ class ComputeNodeMonitorActor(config.actor_class):
if self.arvados_node is None:
crunch_worker_state = 'unpaired'
+ elif not timestamp_fresh(arvados_node_mtime(self.arvados_node), self.node_stale_after):
+ return "node state is stale"
elif self.arvados_node['crunch_worker_state']:
crunch_worker_state = self.arvados_node['crunch_worker_state']
else:
- self._debug("Node is paired but crunch_worker_state is null")
- return
+ return "node is paired but crunch_worker_state is '%s'" % self.arvados_node['crunch_worker_state']
window = "open" if self._shutdowns.window_open() else "closed"
@@ -384,12 +384,20 @@ class ComputeNodeMonitorActor(config.actor_class):
idle_grace = 'idle exceeded'
node_state = (crunch_worker_state, window, boot_grace, idle_grace)
- self._debug("Considering shutdown, node state is %s", node_state)
- eligible = transitions[node_state]
+ t = transitions[node_state]
+ self._debug("Node state is %s, transition is %s", node_state , t)
+ if t:
+ return True
+ else:
+ return "node state is %s" % (node_state,)
+ def consider_shutdown(self):
+ try:
+ eligible = self.shutdown_eligible()
next_opening = self._shutdowns.next_opening()
- if eligible:
- self._debug("Suggesting shutdown.")
+ print(self.last_shutdown_opening, next_opening)
+ if eligible is True:
+ self._debug("Suggesting shutdown")
_notify_subscribers(self.actor_ref.proxy(), self.subscribers)
elif self.last_shutdown_opening != next_opening:
self._debug("Shutdown window closed. Next at %s.",
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 058afda..b301451 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -60,11 +60,12 @@ class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
@RetryMixin._retry((subprocess.CalledProcessError,))
def await_slurm_drain(self):
output = self._get_slurm_state(self._nodename)
- if output in self.SLURM_END_STATES:
- self._later.shutdown_node()
- elif output in ("alloc\n", "idle\n"):
- # Not in "drng" so cancel self.
- self.cancel_shutdown("slurm state is idle")
- else:
+ if output in ("drng\n", "alloc\n"):
self._timer.schedule(time.time() + 10,
self._later.await_slurm_drain)
+ elif output in ("idle\n"):
+ # Not in "drng" so cancel self.
+ self.cancel_shutdown("slurm state is %s" % output.strip())
+ else:
+ # any other state.
+ self._later.shutdown_node()
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index a02da23..519213f 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -151,9 +151,6 @@ class NodeManagerDaemonActor(actor_class):
def _update_poll_time(self, poll_key):
self.last_polls[poll_key] = time.time()
- def _resume_node(self, node_record):
- node_record.actor.resume_node()
-
def _pair_nodes(self, node_record, arvados_node):
self._logger.info("Cloud node %s is now paired with Arvados node %s",
node_record.cloud_node.name, arvados_node['uuid'])
@@ -221,15 +218,6 @@ class NodeManagerDaemonActor(actor_class):
if cloud_rec.actor.offer_arvados_pair(arv_node).get():
self._pair_nodes(cloud_rec, arv_node)
break
- for rec in self.cloud_nodes.nodes.itervalues():
- # crunch-dispatch turns all slurm states that are not either "idle"
- # or "alloc" into "down", but in case that behavior changes, assume
- # any state that is not "idle" or "alloc" could be a state we want
- # to try to resume from.
- if (rec.arvados_node is not None and
- rec.arvados_node["info"].get("slurm_state") not in ("idle", "alloc") and
- rec.cloud_node.id not in self.shutdowns):
- self._resume_node(rec)
def _nodes_booting(self, size):
s = sum(1
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index b722f3c..8bb0c50 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -205,6 +205,7 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
cloud_node = testutil.cloud_node_mock(61)
arv_node = testutil.arvados_node_mock(61)
self.make_mocks(cloud_node, arv_node, shutdown_open=False)
+ self.cloud_client.destroy_node.return_value = False
self.make_actor(cancellable=True)
self.shutdown_actor.cancel_shutdown("test")
self.check_success_flag(False, 2)
@@ -220,14 +221,6 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
self.check_success_flag(True)
self.assertTrue(self.cloud_client.destroy_node.called)
- def test_shutdown_cancelled_when_window_closes(self):
- self.make_mocks(shutdown_open=False)
- self.make_actor()
- self.check_success_flag(False, 2)
- self.assertFalse(self.cloud_client.destroy_node.called)
- self.assertEqual(self.ACTOR_CLASS.WINDOW_CLOSED,
- self.shutdown_actor.cancel_reason.get(self.TIMEOUT))
-
def test_shutdown_retries_when_cloud_fails(self):
self.make_mocks()
self.cloud_client.destroy_node.return_value = False
@@ -357,28 +350,29 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
def test_no_shutdown_booting(self):
self.make_actor()
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is still booting"))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+ "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')")
def test_shutdown_without_arvados_node(self):
self.make_actor(start_time=0)
self.shutdowns._set_state(True, 600)
self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
- def test_no_shutdown_missing(self):
+ def test_shutdown_missing(self):
arv_node = testutil.arvados_node_mock(10, job_uuid=None,
crunch_worker_state="down",
last_ping_at='1970-01-01T01:02:03.04050607Z')
self.make_actor(10, arv_node)
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+ self.assertIs(self.node_actor.shutdown_eligible().get(self.TIMEOUT), True)
- def test_no_shutdown_running_broken(self):
+ def test_shutdown_running_broken(self):
arv_node = testutil.arvados_node_mock(12, job_uuid=None,
crunch_worker_state="down")
self.make_actor(12, arv_node)
self.shutdowns._set_state(True, 600)
self.cloud_client.broken.return_value = True
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+ self.assertIs(self.node_actor.shutdown_eligible().get(self.TIMEOUT), True)
def test_shutdown_missing_broken(self):
arv_node = testutil.arvados_node_mock(11, job_uuid=None,
@@ -391,23 +385,27 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
def test_no_shutdown_when_window_closed(self):
self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("shutdown window is not open."))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+ "node state is ('idle', 'closed', 'boot wait', 'idle exceeded')")
def test_no_shutdown_when_node_running_job(self):
self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+ "node state is ('busy', 'open', 'boot wait', 'idle exceeded')")
def test_no_shutdown_when_node_state_unknown(self):
self.make_actor(5, testutil.arvados_node_mock(
5, crunch_worker_state=None))
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+ "node is paired but crunch_worker_state is 'None'")
def test_no_shutdown_when_node_state_stale(self):
self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+ "node state is stale")
def test_arvados_node_match(self):
self.make_actor(2)
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index a6d6643..6358623 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -41,11 +41,11 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
self.check_success_after_reset(proc_mock, end_state)
return test
- for wait_state in ['alloc\n', 'drng\n', 'idle*\n']:
+ for wait_state in ['alloc\n', 'drng\n']:
locals()['test_wait_while_' + wait_state.strip()
] = make_wait_state_test(start_state=wait_state)
- for end_state in ['down\n', 'down*\n', 'drain\n', 'fail\n']:
+ for end_state in ['idle*\n', 'down\n', 'down*\n', 'drain\n', 'fail\n']:
locals()['test_wait_until_' + end_state.strip()
] = make_wait_state_test(end_state=end_state)
@@ -75,7 +75,7 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
# self.check_slurm_got_args(proc_mock, 'sinfo', '--noheader', '-o', '%t', '-n', 'compute99')
def test_cancel_shutdown_retry(self, proc_mock):
- proc_mock.side_effect = iter([OSError, 'drain\n', OSError, 'idle\n'])
+ proc_mock.side_effect = iter([OSError, 'drain\n', OSError, 'idle\n', 'idle\n'])
self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
self.make_actor()
self.check_success_flag(False, 2)
@@ -88,39 +88,3 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
proc_mock.return_value = 'drain\n'
super(SLURMComputeNodeShutdownActorTestCase,
self).test_arvados_node_cleaned_after_shutdown()
-
-class SLURMComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
- unittest.TestCase):
-
- def make_mocks(self, node_num):
- self.shutdowns = testutil.MockShutdownTimer()
- self.shutdowns._set_state(False, 300)
- self.timer = mock.MagicMock(name='timer_mock')
- self.updates = mock.MagicMock(name='update_mock')
- self.cloud_mock = testutil.cloud_node_mock(node_num)
- self.subscriber = mock.Mock(name='subscriber_mock')
- self.cloud_client = mock.MagicMock(name='cloud_client')
- self.cloud_client.broken.return_value = False
-
- def make_actor(self, node_num=1, arv_node=None, start_time=None):
- if not hasattr(self, 'cloud_mock'):
- self.make_mocks(node_num)
- if start_time is None:
- start_time = time.time()
- self.node_actor = slurm_dispatch.ComputeNodeMonitorActor.start(
- self.cloud_mock, start_time, self.shutdowns,
- testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
- arv_node, boot_fail_after=300).proxy()
- self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)
-
- @mock.patch("subprocess.check_output")
- def test_shutdown_down_node(self, check_output):
- check_output.return_value = "down\n"
- self.make_actor(arv_node=testutil.arvados_node_mock())
- self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-
- @mock.patch("subprocess.check_output")
- def test_no_shutdown_drain_node(self, check_output):
- check_output.return_value = "drain\n"
- self.make_actor(arv_node=testutil.arvados_node_mock())
- self.assertEquals('node is draining', self.node_actor.shutdown_eligible().get(self.TIMEOUT))
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 00e05a1..3b5f721 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -718,25 +718,3 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
# test for that.
self.assertEqual(2, sizecounts[small.id])
self.assertEqual(1, sizecounts[big.id])
-
- @mock.patch("arvnodeman.daemon.NodeManagerDaemonActor._resume_node")
- def test_resume_drained_nodes(self, resume_node):
- cloud_node = testutil.cloud_node_mock(1)
- arv_node = testutil.arvados_node_mock(1, info={"ec2_instance_id": "1", "slurm_state": "down"})
- self.make_daemon([cloud_node], [arv_node])
- resume_node.assert_called_with(self.daemon.cloud_nodes.get(self.TIMEOUT).nodes.values()[0])
- self.stop_proxy(self.daemon)
-
- @mock.patch("arvnodeman.daemon.NodeManagerDaemonActor._resume_node")
- def test_no_resume_shutdown_nodes(self, resume_node):
- cloud_node = testutil.cloud_node_mock(1)
- arv_node = testutil.arvados_node_mock(1, info={"ec2_instance_id": "1", "slurm_state": "down"})
-
- self.make_daemon([cloud_node], [])
-
- self.node_shutdown = mock.MagicMock(name='shutdown_mock')
- self.daemon.shutdowns.get(self.TIMEOUT)[cloud_node.id] = self.node_shutdown
-
- self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
- self.stop_proxy(self.daemon)
- resume_node.assert_not_called()
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list