[ARVADOS] updated: 15b49783aa0ac76508986e772b98ffa9d187c57f
Git user
git at public.curoverse.com
Thu Feb 9 13:48:48 EST 2017
Summary of changes:
.../nodemanager/arvnodeman/computenode/dispatch/__init__.py | 12 ++++++------
.../nodemanager/arvnodeman/computenode/dispatch/slurm.py | 8 ++++----
2 files changed, 10 insertions(+), 10 deletions(-)
via 15b49783aa0ac76508986e772b98ffa9d187c57f (commit)
from 6a7d7a2fa8e217e1ff9440769f39a2095d5bb837 (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit 15b49783aa0ac76508986e772b98ffa9d187c57f
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Thu Feb 9 13:46:14 2017 -0500
10846: Specify whether to try to resume the node when cancelling shutdown
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 05e3c9e..7a94dae 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -204,7 +204,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
self.success = success_flag
return super(ComputeNodeShutdownActor, self)._finished()
- def cancel_shutdown(self, reason):
+ def cancel_shutdown(self, reason, **kwargs):
self.cancel_reason = reason
self._logger.info("Shutdown cancelled: %s.", reason)
self._finished(success_flag=False)
@@ -215,19 +215,19 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
try:
return orig_func(self, *args, **kwargs)
except Exception as error:
- self._logger.error("Actor error %s", error, exc_info=True)
+ self._logger.error("Actor error %s", error)
self._logger.debug("", exc_info=True)
- self._later.cancel_shutdown("Unhandled exception %s" % error)
+ self._later.cancel_shutdown("Unhandled exception %s" % error, try_resume=False)
return finish_wrapper
@_cancel_on_exception
def shutdown_node(self):
if self.cancellable:
self._logger.info("Checking that node is still eligible for shutdown")
- # Check that we still want to shut down the node.
eligible, reason = self._monitor.shutdown_eligible().get()
if not eligible:
- self.cancel_shutdown("No longer eligible for shut down because %s" % reason)
+ self.cancel_shutdown("No longer eligible for shut down because %s" % reason,
+ try_resume=True)
return
self._logger.info("Starting shutdown")
@@ -239,7 +239,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
else:
self._finished(success_flag=True)
else:
- self.cancel_shutdown(self.DESTROY_FAILED)
+ self.cancel_shutdown(self.DESTROY_FAILED, try_resume=False)
@ComputeNodeStateChangeBase._finish_on_exception
@RetryMixin._retry(config.ARVADOS_ERRORS)
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index cae8719..cbeabd1 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -39,9 +39,9 @@ class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
self._later.issue_slurm_drain()
@RetryMixin._retry((subprocess.CalledProcessError,))
- def cancel_shutdown(self, reason):
+ def cancel_shutdown(self, reason, try_resume=True):
if self._nodename:
- if self._get_slurm_state(self._nodename) in self.SLURM_DRAIN_STATES:
+ if try_resume and self._get_slurm_state(self._nodename) in self.SLURM_DRAIN_STATES:
# Resume from "drng" or "drain"
self._set_node_state(self._nodename, 'RESUME')
else:
@@ -70,8 +70,8 @@ class ComputeNodeShutdownActor(SlurmMixin, ShutdownActorBase):
self._timer.schedule(time.time() + 10,
self._later.await_slurm_drain)
elif output in ("idle\n"):
- # Not in "drng" so cancel self.
- self.cancel_shutdown("slurm state is %s" % output.strip())
+ # Not in "drng" but idle, don't shut down
+ self.cancel_shutdown("slurm state is %s" % output.strip(), try_resume=False)
else:
# any other state.
self._later.shutdown_node()
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list