[ARVADOS] updated: 15b49783aa0ac76508986e772b98ffa9d187c57f

Git user git at public.curoverse.com
Thu Feb 9 13:48:48 EST 2017


Summary of changes:
 .../nodemanager/arvnodeman/computenode/dispatch/__init__.py  | 12 ++++++------
 .../nodemanager/arvnodeman/computenode/dispatch/slurm.py     |  8 ++++----
 2 files changed, 10 insertions(+), 10 deletions(-)

       via  15b49783aa0ac76508986e772b98ffa9d187c57f (commit)
      from  6a7d7a2fa8e217e1ff9440769f39a2095d5bb837 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit 15b49783aa0ac76508986e772b98ffa9d187c57f
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Thu Feb 9 13:46:14 2017 -0500

    10846: Specify whether to try to resume the node when cancelling shutdown

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 05e3c9e..7a94dae 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -204,7 +204,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
             self.success = success_flag
         return super(ComputeNodeShutdownActor, self)._finished()
 
-    def cancel_shutdown(self, reason):
+    def cancel_shutdown(self, reason, **kwargs):
         self.cancel_reason = reason
         self._logger.info("Shutdown cancelled: %s.", reason)
         self._finished(success_flag=False)
@@ -215,19 +215,19 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
             try:
                 return orig_func(self, *args, **kwargs)
             except Exception as error:
-                self._logger.error("Actor error %s", error, exc_info=True)
+                self._logger.error("Actor error %s", error)
                 self._logger.debug("", exc_info=True)
-                self._later.cancel_shutdown("Unhandled exception %s" % error)
+                self._later.cancel_shutdown("Unhandled exception %s" % error, try_resume=False)
         return finish_wrapper
 
     @_cancel_on_exception
     def shutdown_node(self):
         if self.cancellable:
             self._logger.info("Checking that node is still eligible for shutdown")
-            # Check that we still want to shut down the node.
             eligible, reason = self._monitor.shutdown_eligible().get()
             if not eligible:
-                self.cancel_shutdown("No longer eligible for shut down because %s" % reason)
+                self.cancel_shutdown("No longer eligible for shut down because %s" % reason,
+                                     try_resume=True)
                 return
 
         self._logger.info("Starting shutdown")
@@ -239,7 +239,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
             else:
                 self._finished(success_flag=True)
         else:
-            self.cancel_shutdown(self.DESTROY_FAILED)
+            self.cancel_shutdown(self.DESTROY_FAILED, try_resume=False)
 
     @ComputeNodeStateChangeBase._finish_on_exception
     @RetryMixin._retry(config.ARVADOS_ERRORS)
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index cae8719..cbeabd1 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -39,9 +39,9 @@ class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
             self._later.issue_slurm_drain()
 
     @RetryMixin._retry((subprocess.CalledProcessError,))
-    def cancel_shutdown(self, reason):
+    def cancel_shutdown(self, reason, try_resume=True):
         if self._nodename:
-            if self._get_slurm_state(self._nodename) in self.SLURM_DRAIN_STATES:
+            if try_resume and self._get_slurm_state(self._nodename) in self.SLURM_DRAIN_STATES:
                 # Resume from "drng" or "drain"
                 self._set_node_state(self._nodename, 'RESUME')
             else:
@@ -70,8 +70,8 @@ class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
             self._timer.schedule(time.time() + 10,
                                  self._later.await_slurm_drain)
         elif output in ("idle\n"):
-            # Not in "drng" so cancel self.
-            self.cancel_shutdown("slurm state is %s" % output.strip())
+            # Not in "drng" but idle, don't shut down
+            self.cancel_shutdown("slurm state is %s" % output.strip(), try_resume=False)
         else:
             # any other state.
             self._later.shutdown_node()

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list