[ARVADOS] updated: f49a4d7ff243bb3e8b15f4c5adf77d6355fb6bcd
Git user
git at public.curoverse.com
Wed Mar 23 16:32:18 EDT 2016
Summary of changes:
.../nodemanager/arvnodeman/computenode/dispatch/slurm.py | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)
via f49a4d7ff243bb3e8b15f4c5adf77d6355fb6bcd (commit)
from c7539c212173e4ce103ec8cc893f0c4f0a942ba3 (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit f49a4d7ff243bb3e8b15f4c5adf77d6355fb6bcd
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Wed Mar 23 16:32:15 2016 -0400
Don't retry slurm subprocess calls that raise OSError, no issue #
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 4d70436..255e50a 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -36,15 +36,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
def _get_slurm_state(self):
return subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', self._nodename])
- # The following methods retry on OSError. This is intended to mitigate bug
- # #6321 where fork() of node manager raises "OSError: [Errno 12] Cannot
- # allocate memory" resulting in the untimely death of the shutdown actor
- # and tends to result in node manager getting into a wedged state where it
- # won't allocate new nodes or shut down gracefully. The underlying causes
- # of the excessive memory usage that result in the "Cannot allocate memory"
- # error are still being investigated.
-
- @RetryMixin._retry((subprocess.CalledProcessError, OSError))
+ @RetryMixin._retry((subprocess.CalledProcessError,))
def cancel_shutdown(self, reason):
if self._nodename:
if self._get_slurm_state() in self.SLURM_DRAIN_STATES:
@@ -56,14 +48,14 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
pass
return super(ComputeNodeShutdownActor, self).cancel_shutdown(reason)
- @RetryMixin._retry((subprocess.CalledProcessError, OSError))
+ @RetryMixin._retry((subprocess.CalledProcessError,))
@ShutdownActorBase._stop_if_window_closed
def issue_slurm_drain(self):
self._set_node_state('DRAIN', 'Reason=Node Manager shutdown')
self._logger.info("Waiting for SLURM node %s to drain", self._nodename)
self._later.await_slurm_drain()
- @RetryMixin._retry((subprocess.CalledProcessError, OSError))
+ @RetryMixin._retry((subprocess.CalledProcessError,))
@ShutdownActorBase._stop_if_window_closed
def await_slurm_drain(self):
output = self._get_slurm_state()
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list