[ARVADOS] updated: 560c318fdc49835b03f96af35774fbbfa7984fe7

git at public.curoverse.com git at public.curoverse.com
Tue Sep 29 16:42:14 EDT 2015


Summary of changes:
 .../arvnodeman/computenode/dispatch/__init__.py    |  3 ++
 .../arvnodeman/computenode/dispatch/slurm.py       |  4 +-
 .../nodemanager/tests/test_computenode_dispatch.py | 52 +++++++++++++++++-----
 .../tests/test_computenode_dispatch_slurm.py       |  2 +-
 4 files changed, 48 insertions(+), 13 deletions(-)

       via  560c318fdc49835b03f96af35774fbbfa7984fe7 (commit)
      from  72e3566f2cdacd44f095183ebf88f7aab8b0d8dc (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit 560c318fdc49835b03f96af35774fbbfa7984fe7
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Sep 29 16:43:41 2015 -0400

    7286: Tests for new "missing and broken" shutdown policy.

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 4557198..70e6e8e 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -323,6 +323,9 @@ class ComputeNodeMonitorActor(config.actor_class):
         return result
 
     def shutdown_eligible(self):
+        import logging
+        logging.warn("XXX %s %s", self.arvados_node, self._cloud.broken(self.cloud_node))
+
         if not self._shutdowns.window_open():
             return False
         elif self.arvados_node is None:
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 71e73f1..3c26629 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -15,6 +15,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
     def on_start(self):
         arv_node = self._arvados_node()
         if arv_node is None:
+            self._nodename = None
             return super(ComputeNodeShutdownActor, self).on_start()
         else:
             self._nodename = arv_node['hostname']
@@ -29,7 +30,8 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
 
     @ShutdownActorBase._retry((subprocess.CalledProcessError,))
     def cancel_shutdown(self):
-        self._set_node_state('RESUME')
+        if self._nodename:
+            self._set_node_state('RESUME')
         return super(ComputeNodeShutdownActor, self).cancel_shutdown()
 
     @ShutdownActorBase._stop_if_window_closed
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index c22e7a0..707cdc6 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -128,12 +128,14 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
         self.cloud_node = cloud_node
         self.arvados_node = arvados_node
 
-    def make_actor(self, cancellable=True):
+    def make_actor(self, cancellable=True, start_time=None):
         if not hasattr(self, 'timer'):
             self.make_mocks()
+        if start_time is None:
+            start_time = time.time()
         monitor_actor = dispatch.ComputeNodeMonitorActor.start(
-            self.cloud_node, time.time(), self.shutdowns,
-            testutil.cloud_node_fqdn, self.timer, self.updates,
+            self.cloud_node, start_time, self.shutdowns,
+            testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
             self.arvados_node)
         self.shutdown_actor = self.ACTOR_CLASS.start(
             self.timer, self.cloud_client, self.arvados_client, monitor_actor,
@@ -190,7 +192,7 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
     ACTOR_CLASS = dispatch.ComputeNodeShutdownActor
 
     def test_easy_shutdown(self):
-        self.make_actor()
+        self.make_actor(start_time=0)
         self.check_success_flag(True)
         self.assertTrue(self.cloud_client.destroy_node.called)
 
@@ -203,7 +205,7 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
     def test_shutdown_retries_when_cloud_fails(self):
         self.make_mocks()
         self.cloud_client.destroy_node.return_value = False
-        self.make_actor()
+        self.make_actor(start_time=0)
         self.assertIsNone(self.shutdown_actor.success.get(self.TIMEOUT))
         self.cloud_client.destroy_node.return_value = True
         self.check_success_flag(True)
@@ -241,6 +243,7 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.updates = mock.MagicMock(name='update_mock')
         self.cloud_mock = testutil.cloud_node_mock(node_num)
         self.subscriber = mock.Mock(name='subscriber_mock')
+        self.cloud_client = mock.MagicMock(name='cloud_client')
 
     def make_actor(self, node_num=1, arv_node=None, start_time=None):
         if not hasattr(self, 'cloud_mock'):
@@ -249,8 +252,8 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
             start_time = time.time()
         self.node_actor = dispatch.ComputeNodeMonitorActor.start(
             self.cloud_mock, start_time, self.shutdowns,
-            testutil.cloud_node_fqdn, self.timer, self.updates,
-            arv_node).proxy()
+            testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
+            arv_node, boot_fail_after=300).proxy()
         self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)
 
     def node_state(self, *states):
@@ -298,23 +301,50 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.assertFalse(self.subscriber.called)
 
     def test_shutdown_subscription(self):
-        self.make_actor()
+        self.make_actor(start_time=0)
         self.shutdowns._set_state(True, 600)
         self.node_actor.consider_shutdown().get(self.TIMEOUT)
         self.assertTrue(self.subscriber.called)
         self.assertEqual(self.node_actor.actor_ref.actor_urn,
                          self.subscriber.call_args[0][0].actor_ref.actor_urn)
 
-    def test_shutdown_without_arvados_node(self):
+    def test_no_shutdown_booting(self):
         self.make_actor()
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
-    def test_no_shutdown_without_arvados_node_and_old_cloud_node(self):
+    def test_shutdown_without_arvados_node(self):
         self.make_actor(start_time=0)
         self.shutdowns._set_state(True, 600)
+        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+
+    def test_no_shutdown_missing(self):
+        arv_node = testutil.arvados_node_mock(10, job_uuid=None,
+                                              crunch_worker_state="down",
+                                              status="missing")
+        self.make_actor(10, arv_node)
+        self.shutdowns._set_state(True, 600)
+        self.cloud_client.broken.return_value = False
         self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
+    def test_no_shutdown_running_broken(self):
+        arv_node = testutil.arvados_node_mock(12, job_uuid=None,
+                                              crunch_worker_state="down",
+                                              status="running")
+        self.make_actor(12, arv_node)
+        self.shutdowns._set_state(True, 600)
+        self.cloud_client.broken.return_value = True
+        self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+
+    def test_shutdown_missing_broken(self):
+        arv_node = testutil.arvados_node_mock(11, job_uuid=None,
+                                              crunch_worker_state="down",
+                                              status="missing")
+        self.make_actor(11, arv_node)
+        self.shutdowns._set_state(True, 600)
+        self.cloud_client.broken.return_value = True
+        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+
     def test_no_shutdown_when_window_closed(self):
         self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
         self.assertFalse(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index ac3ebf0..c5097a7 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -55,7 +55,7 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
     def test_slurm_bypassed_when_no_arvados_node(self, proc_mock):
         # Test we correctly handle a node that failed to bootstrap.
         proc_mock.return_value = 'idle\n'
-        self.make_actor()
+        self.make_actor(start_time=0)
         self.check_success_flag(True)
         self.assertFalse(proc_mock.called)
 

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list