[ARVADOS] updated: abfdd429351ad6b457667b624c8ba550ffcac999

Git user git at public.curoverse.com
Fri Apr 15 10:26:51 EDT 2016


Summary of changes:
 .../arvnodeman/computenode/dispatch/__init__.py    | 23 +++++---
 .../arvnodeman/computenode/dispatch/slurm.py       | 13 +++--
 services/nodemanager/arvnodeman/daemon.py          | 12 ----
 .../nodemanager/tests/test_computenode_dispatch.py | 32 +++++-----
 .../tests/test_computenode_dispatch_slurm.py       | 68 ++++++----------------
 services/nodemanager/tests/test_daemon.py          | 22 -------
 6 files changed, 54 insertions(+), 116 deletions(-)

       via  abfdd429351ad6b457667b624c8ba550ffcac999 (commit)
       via  2a13fcc84b240e368787c8e94ced95d75eee0cc4 (commit)
      from  26751323e77005dc158b64e86c47bbb9459e6697 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit abfdd429351ad6b457667b624c8ba550ffcac999
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Fri Apr 15 10:26:46 2016 -0400

    Fixup test_node_undrained_when_shutdown_cancelled and test_alloc_node_undrained_when_shutdown_cancelled.

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 33f9de6..9bb7bd0 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -395,7 +395,6 @@ class ComputeNodeMonitorActor(config.actor_class):
         try:
             eligible = self.shutdown_eligible()
             next_opening = self._shutdowns.next_opening()
-            print(self.last_shutdown_opening, next_opening)
             if eligible is True:
                 self._debug("Suggesting shutdown")
                 _notify_subscribers(self.actor_ref.proxy(), self.subscribers)
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index 6358623..0f98afc 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -60,19 +60,21 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
         self.check_success_flag(True)
         self.assertFalse(proc_mock.called)
 
-    # def test_node_undrained_when_shutdown_window_closes(self, proc_mock):
-    #     proc_mock.side_effect = iter(['drng\n', 'idle\n'])
-    #     self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
-    #     self.make_actor()
-    #     self.check_success_flag(False, 2)
-    #     self.check_slurm_got_args(proc_mock, 'NodeName=compute99', 'State=RESUME')
-
-    # def test_alloc_node_undrained_when_shutdown_window_closes(self, proc_mock):
-    #     proc_mock.side_effect = iter(['alloc\n'])
-    #     self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
-    #     self.make_actor()
-    #     self.check_success_flag(False, 2)
-    #     self.check_slurm_got_args(proc_mock, 'sinfo', '--noheader', '-o', '%t', '-n', 'compute99')
+    def test_node_undrained_when_shutdown_cancelled(self, proc_mock):
+        proc_mock.side_effect = iter(['drng\n', 'idle\n'])
+        self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
+        self.make_actor()
+        self.shutdown_actor.cancel_shutdown("test")
+        self.check_success_flag(False, 2)
+        self.check_slurm_got_args(proc_mock, 'NodeName=compute99', 'State=RESUME')
+
+    def test_alloc_node_undrained_when_shutdown_cancelled(self, proc_mock):
+        proc_mock.side_effect = iter(['alloc\n'])
+        self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
+        self.make_actor()
+        self.shutdown_actor.cancel_shutdown("test")
+        self.check_success_flag(False, 2)
+        self.check_slurm_got_args(proc_mock, 'sinfo', '--noheader', '-o', '%t', '-n', 'compute99')
 
     def test_cancel_shutdown_retry(self, proc_mock):
         proc_mock.side_effect = iter([OSError, 'drain\n', OSError, 'idle\n', 'idle\n'])

commit 2a13fcc84b240e368787c8e94ced95d75eee0cc4
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Fri Apr 15 08:10:13 2016 -0400

    8953: Tests pass, with some removed due to removal of the corresponding behavior.

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index a2d24b3..33f9de6 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -355,8 +355,7 @@ class ComputeNodeMonitorActor(config.actor_class):
             result = result and not self.arvados_node['job_uuid']
         return result
 
-    def consider_shutdown(self):
-        try:
+    def shutdown_eligible(self):
             # Collect states and then consult state transition table
             # whether we should shut down.  Possible states are:
             #
@@ -367,11 +366,12 @@ class ComputeNodeMonitorActor(config.actor_class):
 
             if self.arvados_node is None:
                 crunch_worker_state = 'unpaired'
+            elif not timestamp_fresh(arvados_node_mtime(self.arvados_node), self.node_stale_after):
+                return "node state is stale"
             elif self.arvados_node['crunch_worker_state']:
                 crunch_worker_state = self.arvados_node['crunch_worker_state']
             else:
-                self._debug("Node is paired but crunch_worker_state is null")
-                return
+                return "node is paired but crunch_worker_state is '%s'" % self.arvados_node['crunch_worker_state']
 
             window = "open" if self._shutdowns.window_open() else "closed"
 
@@ -384,12 +384,20 @@ class ComputeNodeMonitorActor(config.actor_class):
             idle_grace = 'idle exceeded'
 
             node_state = (crunch_worker_state, window, boot_grace, idle_grace)
-            self._debug("Considering shutdown, node state is %s", node_state)
-            eligible = transitions[node_state]
+            t = transitions[node_state]
+            self._debug("Node state is %s, transition is %s", node_state , t)
+            if t:
+                return True
+            else:
+                return "node state is %s" % (node_state,)
 
+    def consider_shutdown(self):
+        try:
+            eligible = self.shutdown_eligible()
             next_opening = self._shutdowns.next_opening()
-            if eligible:
-                self._debug("Suggesting shutdown.")
+            print(self.last_shutdown_opening, next_opening)
+            if eligible is True:
+                self._debug("Suggesting shutdown")
                 _notify_subscribers(self.actor_ref.proxy(), self.subscribers)
             elif self.last_shutdown_opening != next_opening:
                 self._debug("Shutdown window closed.  Next at %s.",
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 058afda..b301451 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -60,11 +60,12 @@ class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
     @RetryMixin._retry((subprocess.CalledProcessError,))
     def await_slurm_drain(self):
         output = self._get_slurm_state(self._nodename)
-        if output in self.SLURM_END_STATES:
-            self._later.shutdown_node()
-        elif output in ("alloc\n", "idle\n"):
-            # Not in "drng" so cancel self.
-            self.cancel_shutdown("slurm state is idle")
-        else:
+        if output in ("drng\n", "alloc\n"):
             self._timer.schedule(time.time() + 10,
                                  self._later.await_slurm_drain)
+        elif output in ("idle\n"):
+            # Not in "drng" so cancel self.
+            self.cancel_shutdown("slurm state is %s" % output.strip())
+        else:
+            # any other state.
+            self._later.shutdown_node()
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index a02da23..519213f 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -151,9 +151,6 @@ class NodeManagerDaemonActor(actor_class):
     def _update_poll_time(self, poll_key):
         self.last_polls[poll_key] = time.time()
 
-    def _resume_node(self, node_record):
-        node_record.actor.resume_node()
-
     def _pair_nodes(self, node_record, arvados_node):
         self._logger.info("Cloud node %s is now paired with Arvados node %s",
                           node_record.cloud_node.name, arvados_node['uuid'])
@@ -221,15 +218,6 @@ class NodeManagerDaemonActor(actor_class):
                 if cloud_rec.actor.offer_arvados_pair(arv_node).get():
                     self._pair_nodes(cloud_rec, arv_node)
                     break
-        for rec in self.cloud_nodes.nodes.itervalues():
-            # crunch-dispatch turns all slurm states that are not either "idle"
-            # or "alloc" into "down", but in case that behavior changes, assume
-            # any state that is not "idle" or "alloc" could be a state we want
-            # to try to resume from.
-            if (rec.arvados_node is not None and
-                rec.arvados_node["info"].get("slurm_state") not in ("idle", "alloc") and
-                rec.cloud_node.id not in self.shutdowns):
-                self._resume_node(rec)
 
     def _nodes_booting(self, size):
         s = sum(1
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index b722f3c..8bb0c50 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -205,6 +205,7 @@ class ComputeNodeShutdownActorMixin(testutil.ActorTestMixin):
         cloud_node = testutil.cloud_node_mock(61)
         arv_node = testutil.arvados_node_mock(61)
         self.make_mocks(cloud_node, arv_node, shutdown_open=False)
+        self.cloud_client.destroy_node.return_value = False
         self.make_actor(cancellable=True)
         self.shutdown_actor.cancel_shutdown("test")
         self.check_success_flag(False, 2)
@@ -220,14 +221,6 @@ class ComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
         self.check_success_flag(True)
         self.assertTrue(self.cloud_client.destroy_node.called)
 
-    def test_shutdown_cancelled_when_window_closes(self):
-        self.make_mocks(shutdown_open=False)
-        self.make_actor()
-        self.check_success_flag(False, 2)
-        self.assertFalse(self.cloud_client.destroy_node.called)
-        self.assertEqual(self.ACTOR_CLASS.WINDOW_CLOSED,
-                         self.shutdown_actor.cancel_reason.get(self.TIMEOUT))
-
     def test_shutdown_retries_when_cloud_fails(self):
         self.make_mocks()
         self.cloud_client.destroy_node.return_value = False
@@ -357,28 +350,29 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
     def test_no_shutdown_booting(self):
         self.make_actor()
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is still booting"))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+                          "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')")
 
     def test_shutdown_without_arvados_node(self):
         self.make_actor(start_time=0)
         self.shutdowns._set_state(True, 600)
         self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
-    def test_no_shutdown_missing(self):
+    def test_shutdown_missing(self):
         arv_node = testutil.arvados_node_mock(10, job_uuid=None,
                                               crunch_worker_state="down",
                                               last_ping_at='1970-01-01T01:02:03.04050607Z')
         self.make_actor(10, arv_node)
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertIs(self.node_actor.shutdown_eligible().get(self.TIMEOUT), True)
 
-    def test_no_shutdown_running_broken(self):
+    def test_shutdown_running_broken(self):
         arv_node = testutil.arvados_node_mock(12, job_uuid=None,
                                               crunch_worker_state="down")
         self.make_actor(12, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertIs(self.node_actor.shutdown_eligible().get(self.TIMEOUT), True)
 
     def test_shutdown_missing_broken(self):
         arv_node = testutil.arvados_node_mock(11, job_uuid=None,
@@ -391,23 +385,27 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
 
     def test_no_shutdown_when_window_closed(self):
         self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("shutdown window is not open."))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+                          "node state is ('idle', 'closed', 'boot wait', 'idle exceeded')")
 
     def test_no_shutdown_when_node_running_job(self):
         self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+                          "node state is ('busy', 'open', 'boot wait', 'idle exceeded')")
 
     def test_no_shutdown_when_node_state_unknown(self):
         self.make_actor(5, testutil.arvados_node_mock(
             5, crunch_worker_state=None))
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+                          "node is paired but crunch_worker_state is 'None'")
 
     def test_no_shutdown_when_node_state_stale(self):
         self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
         self.shutdowns._set_state(True, 600)
-        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT).startswith("node is not idle."))
+        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
+                          "node state is stale")
 
     def test_arvados_node_match(self):
         self.make_actor(2)
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index a6d6643..6358623 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -41,11 +41,11 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
             self.check_success_after_reset(proc_mock, end_state)
         return test
 
-    for wait_state in ['alloc\n', 'drng\n', 'idle*\n']:
+    for wait_state in ['alloc\n', 'drng\n']:
         locals()['test_wait_while_' + wait_state.strip()
                  ] = make_wait_state_test(start_state=wait_state)
 
-    for end_state in ['down\n', 'down*\n', 'drain\n', 'fail\n']:
+    for end_state in ['idle*\n', 'down\n', 'down*\n', 'drain\n', 'fail\n']:
         locals()['test_wait_until_' + end_state.strip()
                  ] = make_wait_state_test(end_state=end_state)
 
@@ -75,7 +75,7 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
     #     self.check_slurm_got_args(proc_mock, 'sinfo', '--noheader', '-o', '%t', '-n', 'compute99')
 
     def test_cancel_shutdown_retry(self, proc_mock):
-        proc_mock.side_effect = iter([OSError, 'drain\n', OSError, 'idle\n'])
+        proc_mock.side_effect = iter([OSError, 'drain\n', OSError, 'idle\n', 'idle\n'])
         self.make_mocks(arvados_node=testutil.arvados_node_mock(job_uuid=True))
         self.make_actor()
         self.check_success_flag(False, 2)
@@ -88,39 +88,3 @@ class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
         proc_mock.return_value = 'drain\n'
         super(SLURMComputeNodeShutdownActorTestCase,
               self).test_arvados_node_cleaned_after_shutdown()
-
-class SLURMComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
-                                      unittest.TestCase):
-
-    def make_mocks(self, node_num):
-        self.shutdowns = testutil.MockShutdownTimer()
-        self.shutdowns._set_state(False, 300)
-        self.timer = mock.MagicMock(name='timer_mock')
-        self.updates = mock.MagicMock(name='update_mock')
-        self.cloud_mock = testutil.cloud_node_mock(node_num)
-        self.subscriber = mock.Mock(name='subscriber_mock')
-        self.cloud_client = mock.MagicMock(name='cloud_client')
-        self.cloud_client.broken.return_value = False
-
-    def make_actor(self, node_num=1, arv_node=None, start_time=None):
-        if not hasattr(self, 'cloud_mock'):
-            self.make_mocks(node_num)
-        if start_time is None:
-            start_time = time.time()
-        self.node_actor = slurm_dispatch.ComputeNodeMonitorActor.start(
-            self.cloud_mock, start_time, self.shutdowns,
-            testutil.cloud_node_fqdn, self.timer, self.updates, self.cloud_client,
-            arv_node, boot_fail_after=300).proxy()
-        self.node_actor.subscribe(self.subscriber).get(self.TIMEOUT)
-
-    @mock.patch("subprocess.check_output")
-    def test_shutdown_down_node(self, check_output):
-        check_output.return_value = "down\n"
-        self.make_actor(arv_node=testutil.arvados_node_mock())
-        self.assertIs(True, self.node_actor.shutdown_eligible().get(self.TIMEOUT))
-
-    @mock.patch("subprocess.check_output")
-    def test_no_shutdown_drain_node(self, check_output):
-        check_output.return_value = "drain\n"
-        self.make_actor(arv_node=testutil.arvados_node_mock())
-        self.assertEquals('node is draining', self.node_actor.shutdown_eligible().get(self.TIMEOUT))
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 00e05a1..3b5f721 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -718,25 +718,3 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         # test for that.
         self.assertEqual(2, sizecounts[small.id])
         self.assertEqual(1, sizecounts[big.id])
-
-    @mock.patch("arvnodeman.daemon.NodeManagerDaemonActor._resume_node")
-    def test_resume_drained_nodes(self, resume_node):
-        cloud_node = testutil.cloud_node_mock(1)
-        arv_node = testutil.arvados_node_mock(1, info={"ec2_instance_id": "1", "slurm_state": "down"})
-        self.make_daemon([cloud_node], [arv_node])
-        resume_node.assert_called_with(self.daemon.cloud_nodes.get(self.TIMEOUT).nodes.values()[0])
-        self.stop_proxy(self.daemon)
-
-    @mock.patch("arvnodeman.daemon.NodeManagerDaemonActor._resume_node")
-    def test_no_resume_shutdown_nodes(self, resume_node):
-        cloud_node = testutil.cloud_node_mock(1)
-        arv_node = testutil.arvados_node_mock(1, info={"ec2_instance_id": "1", "slurm_state": "down"})
-
-        self.make_daemon([cloud_node], [])
-
-        self.node_shutdown = mock.MagicMock(name='shutdown_mock')
-        self.daemon.shutdowns.get(self.TIMEOUT)[cloud_node.id] = self.node_shutdown
-
-        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
-        self.stop_proxy(self.daemon)
-        resume_node.assert_not_called()

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list