[ARVADOS] updated: 560c318fdc49835b03f96af35774fbbfa7984fe7
git at public.curoverse.com
git at public.curoverse.com
Tue Sep 29 16:42:14 EDT 2015
Summary of changes:
.../arvnodeman/computenode/dispatch/__init__.py | 3 ++
.../arvnodeman/computenode/dispatch/slurm.py | 4 +-
.../nodemanager/tests/test_computenode_dispatch.py | 52 +++++++++++++++++-----
.../tests/test_computenode_dispatch_slurm.py | 2 +-
4 files changed, 48 insertions(+), 13 deletions(-)
via 560c318fdc49835b03f96af35774fbbfa7984fe7 (commit)
from 72e3566f2cdacd44f095183ebf88f7aab8b0d8dc (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit 560c318fdc49835b03f96af35774fbbfa7984fe7
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Sep 29 16:43:41 2015 -0400
7286: Tests for new "missing and broken" shutdown policy.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 4557198..70e6e8e 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -323,6 +323,9 @@ class ComputeNodeMonitorActor(config.actor_class):
return result
def shutdown_eligible(self):
+ import logging
+ logging.warn("XXX %s %s", self.arvados_node, self._cloud.broken(self.cloud_node))
+
if not self._shutdowns.window_open():
return False
elif self.arvados_node is None:
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 71e73f1..3c26629 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -15,6 +15,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
def on_start(self):
arv_node = self._arvados_node()
if arv_node is None:
+ self._nodename = None
return super(ComputeNodeShutdownActor, self).on_start()
else:
self._nodename = arv_node['hostname']
@@ -29,7 +30,8 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
@ShutdownActorBase._retry((subprocess.CalledProcessError,))
def cancel_shutdown(self):
- self._set_node_state('RESUME')
+ if self._nodename:
+ self._set_node_state('RESUME')
return super(ComputeNodeShutdownActor, self).cancel_shutdown()
@ShutdownActorBase._stop_if_window_closed
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index c22e7a0..707cdc6 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -128,12 +128,14 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
self.cloud_node = cloud_node
self.arvados_node = arvados_node
- def make_actor(self, cancellable=True):
+ def make_actor(self, cancellable=True, start_time=None):
if not hasattr(self, 'timer'):
self.make_mocks()
+ if start_time is None:
+ start_time = time.time()
monitor_actor = dispatch.ComputeNodeMonitorActor.start(
- self.cloud_node, time.time(), self.shutdowns,
- testutil.cloud_node_fqdn, self.timer, self.updates,
+ self.cloud_node, start_time, self.shutdowns,
+ testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
self.arvados_node)
self.shutdown_actor = self.ACTOR_CLASS.start(
self.timer, self.cloud_client, self.arvados_client, monitor_actor,
@@ -190,7 +192,7 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
ACTOR_CLASS = dispatch.ComputeNodeShutdownActor
def test_easy_shutdown(self):
- self.make_actor()
+ self.make_actor(start_time=0)
self.check_success_flag(True)
self.assertTrue(self.cloud_client.destroy_node.called)
@@ -203,7 +205,7 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
def test_shutdown_retries_when_cloud_fails(self):
self.make_mocks()
self.cloud_client.destroy_node.return_value = False
- self.make_actor()
+ self.make_actor(start_time=0)
self.assertIsNone(self.shutdown_actor.success.get(self.TIMEOUT))
self.cloud_client.destroy_node.return_value = True
self.check_success_flag(True)
@@ -241,6 +243,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
self.updates = mock.MagicMock(name='update_mock')
self.cloud_mock = testutil.cloud_node_mock(node_num)
self.subscriber = mock.Mock(name='subscriber_mock')
+ self.cloud_client = mock.MagicMock(name='cloud_client')
def make_actor(self, node_num=1, arv_node=None, start_time=None):
if not hasattr(self, 'cloud_mock'):
@@ -249,8 +252,8 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
start_time = time.time()
self.node_actor = dispatch.ComputeNodeMonitorActor.start(
self.cloud_mock, start_time, self.shutdowns,
- testutil.cloud_node_fqdn, self.timer, self.updates,
- arv_node).proxy()
+ testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
+ arv_node, boot_fail_after=300).proxy()
self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)
def node_state(self, *states):
@@ -298,23 +301,50 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
self.assertFalse(self.subscriber.called)
def test_shutdown_subscription(self):
- self.make_actor()
+ self.make_actor(start_time=0)
self.shutdowns._set_state(True, 600)
self.node_actor.consider_shutdown().get(self.TIMEOUT)
self.assertTrue(self.subscriber.called)
self.assertEqual(self.node_actor.actor_ref.actor_urn,
self.subscriber.call_args[0][0].actor_ref.actor_urn)
- def test_shutdown_without_arvados_node(self):
+ def test_no_shutdown_booting(self):
self.make_actor()
self.shutdowns._set_state(True, 600)
- self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+ self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
- def test_no_shutdown_without_arvados_node_and_old_cloud_node(self):
+ def test_shutdown_without_arvados_node(self):
self.make_actor(start_time=0)
self.shutdowns._set_state(True, 600)
+ self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+
+ def test_no_shutdown_missing(self):
+ arv_node = testutil.arvados_node_mock(10, job_uuid=None,
+ crunch_worker_state="down",
+ status="missing")
+ self.make_actor(10, arv_node)
+ self.shutdowns._set_state(True, 600)
+ self.cloud_client.broken.return_value = False
self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+ def test_no_shutdown_running_broken(self):
+ arv_node = testutil.arvados_node_mock(12, job_uuid=None,
+ crunch_worker_state="down",
+ status="running")
+ self.make_actor(12, arv_node)
+ self.shutdowns._set_state(True, 600)
+ self.cloud_client.broken.return_value = True
+ self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+
+ def test_shutdown_missing_broken(self):
+ arv_node = testutil.arvados_node_mock(11, job_uuid=None,
+ crunch_worker_state="down",
+ status="missing")
+ self.make_actor(11, arv_node)
+ self.shutdowns._set_state(True, 600)
+ self.cloud_client.broken.return_value = True
+ self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+
def test_no_shutdown_when_window_closed(self):
self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index ac3ebf0..c5097a7 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -55,7 +55,7 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
def test_slurm_bypassed_when_no_arvados_node(self, proc_mock):
# Test we correctly handle a node that failed to bootstrap.
proc_mock.return_value = 'idle\n'
- self.make_actor()
+ self.make_actor(start_time=0)
self.check_success_flag(True)
self.assertFalse(proc_mock.called)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list