[ARVADOS] updated: e13c868ed9216b4ad414adc435a9f9ed5afe2b89

Git user git at public.curoverse.com
Wed Apr 6 15:52:00 EDT 2016


Summary of changes:
 .../nodemanager/arvnodeman/computenode/dispatch/slurm.py     | 12 +++++++-----
 .../nodemanager/tests/test_computenode_dispatch_slurm.py     |  8 ++++++++
 2 files changed, 15 insertions(+), 5 deletions(-)

       via  e13c868ed9216b4ad414adc435a9f9ed5afe2b89 (commit)
      from  241ef75ec8b6cf5dd14ce19fa068462adaeb0386 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit e13c868ed9216b4ad414adc435a9f9ed5afe2b89
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Wed Apr 6 15:51:56 2016 -0400

    8799: Nodes in "drain" state are not automatically eligible for shutdown to
    avoid a race between starting a shutdown and resume_node().

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 9ef54b3..845379f 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -72,11 +72,13 @@ class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
 class ComputeNodeMonitorActor(SlurmMixin, MonitorActorBase):
 
     def shutdown_eligible(self):
-        if (self.arvados_node is not None and
-            self._get_slurm_state(self.arvados_node['hostname']) in self.SLURM_END_STATES):
-            return True
-        else:
-            return super(ComputeNodeMonitorActor, self).shutdown_eligible()
+        if self.arvados_node is not None:
+            state = self._get_slurm_state(self.arvados_node['hostname'])
+            # Automatically eligible for shutdown if it's down or failed, but
+            # not drain to avoid a race condition with resume_node().
+            if state in self.SLURM_END_STATES and state not in self.SLURM_DRAIN_STATES:
+                return True
+        return super(ComputeNodeMonitorActor, self).shutdown_eligible()
 
     def resume_node(self):
         try:
diff --git a/services/nodemanager/tests/test_computenode_dispatch_slurm.py b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
index 212bb3d..6e03a7d 100644
--- a/services/nodemanager/tests/test_computenode_dispatch_slurm.py
+++ b/services/nodemanager/tests/test_computenode_dispatch_slurm.py
@@ -160,3 +160,11 @@ class SLURMComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.assertEquals('shutdown window is not open.', self.node_actor.shutdown_eligible().get(self.TIMEOUT))
         self.shutdowns._set_state(True, 600)
         self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+
+    @mock.patch("subprocess.check_output")
+    def test_no_shutdown_drain_node(self, check_output):
+        check_output.return_value = "drain\n"
+        self.make_actor()
+        self.assertEquals('shutdown window is not open.', self.node_actor.shutdown_eligible().get(self.TIMEOUT))
+        self.shutdowns._set_state(True, 600)
+        self.assertTrue(self.node_actor.shutdown_eligible().get(self.TIMEOUT))

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list