[ARVADOS] updated: 0f74b22fb1b64f983debb78d7946eb26728fcc52
git at public.curoverse.com
git at public.curoverse.com
Fri Oct 16 11:39:57 EDT 2015
Summary of changes:
.../nodemanager/arvnodeman/computenode/dispatch/slurm.py | 14 +++++++++++---
.../nodemanager/tests/test_computenode_dispatch_slurm.py | 9 +++++++++
2 files changed, 20 insertions(+), 3 deletions(-)
via 0f74b22fb1b64f983debb78d7946eb26728fcc52 (commit)
via eb220f290840b6b28e1979972a3a672dbf17b117 (commit)
via e0f940ec72a9ed000185196f8d01419302b3cb59 (commit)
via 1e74db260a84317d58969b9b530d0d87a325da9c (commit)
from 807e4cd4abee760736fa32704785d673b0e908cd (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 0f74b22fb1b64f983debb78d7946eb26728fcc52
Merge: 807e4cd eb220f2
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri Oct 16 11:42:01 2015 -0400
Merge branch '6321-slurm-oserror' closes #6321
commit eb220f290840b6b28e1979972a3a672dbf17b117
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri Oct 16 11:40:36 2015 -0400
6321: Add note about rationale for retrying on OSError.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index b4ed088..ec5014e 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -34,6 +34,14 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
def _get_slurm_state(self):
return subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', self._nodename])
+ # The following methods retry on OSError. This is intended to mitigate bug
+ # #6321, where fork() of node manager raises "OSError: [Errno 12] Cannot
+ # allocate memory". That error causes the untimely death of the shutdown
+ # actor and tends to leave node manager in a wedged state where it won't
+ # allocate new nodes or shut down gracefully. The underlying causes of the
+ # excessive memory usage that results in the "Cannot allocate memory"
+ # error are still being investigated.
+
@ShutdownActorBase._retry((subprocess.CalledProcessError, OSError))
def cancel_shutdown(self):
if self._nodename:
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list