[ARVADOS] created: 9c95a4c2dcd650627d524513e1e18596c8533ac0
git at public.curoverse.com
git at public.curoverse.com
Tue Oct 6 16:41:15 EDT 2015
at 9c95a4c2dcd650627d524513e1e18596c8533ac0 (commit)
commit 9c95a4c2dcd650627d524513e1e18596c8533ac0
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Oct 6 16:42:51 2015 -0400
6142: If self._set_node_state('RESUME') in cancel_shutdown() fails because
scontrol exits non-zero, check the node state and only retry if the node is in
the 'drain' or 'drng' (draining) state.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 225d856..bb397fa 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -13,6 +13,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
SLURM_END_STATES = frozenset(['down\n', 'down*\n',
'drain\n', 'drain*\n',
'fail\n', 'fail*\n'])
+ SLURM_DRAIN_STATES = frozenset(['drain\n', 'drng\n'])
def on_start(self):
arv_node = self._arvados_node()
@@ -30,10 +31,26 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
cmd.extend(args)
subprocess.check_output(cmd)
+ def _get_slurm_state(self):
+ return subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', self._nodename])
+
@ShutdownActorBase._retry((subprocess.CalledProcessError,))
def cancel_shutdown(self):
if self._nodename:
- self._set_node_state('RESUME')
+ try:
+ self._set_node_state('RESUME')
+ except subprocess.CalledProcessError:
+ slum_state = self._get_slurm_state()
+ if slum_state in self.SLURM_DRAIN_STATES:
+ # We expect to be able to resume from "drain" or "drng", so if
+ # scontrol exited non-zero, something actually failed; re-raise
+ # the exception to signal the retry to kick in.
+ raise
+ else:
+ # Assume scontrol exited non-zero because the node is already in
+ # 'idle' or 'alloc' (so it never started draining), in which case
+ # we don't need to do anything else to resume it.
+ pass
return super(ComputeNodeShutdownActor, self).cancel_shutdown()
@ShutdownActorBase._stop_if_window_closed
@@ -46,8 +63,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
@ShutdownActorBase._stop_if_window_closed
@ShutdownActorBase._retry((subprocess.CalledProcessError,))
def await_slurm_drain(self):
- output = subprocess.check_output(
- ['sinfo', '--noheader', '-o', '%t', '-n', self._nodename])
+ output = self._get_slurm_state()
if output in self.SLURM_END_STATES:
self._later.shutdown_node()
else:
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list