[ARVADOS] updated: f49a4d7ff243bb3e8b15f4c5adf77d6355fb6bcd

Git user <git@public.curoverse.com>
Wed Mar 23 16:32:18 EDT 2016


Summary of changes:
 .../nodemanager/arvnodeman/computenode/dispatch/slurm.py   | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

       via  f49a4d7ff243bb3e8b15f4c5adf77d6355fb6bcd (commit)
      from  c7539c212173e4ce103ec8cc893f0c4f0a942ba3 (commit)

The revisions listed above that are new to this repository have not
appeared in any other notification email, so we list them in full below.


commit f49a4d7ff243bb3e8b15f4c5adf77d6355fb6bcd
Author: Peter Amstutz <peter.amstutz@curoverse.com>
Date:   Wed Mar 23 16:32:15 2016 -0400

    Don't retry slurm subprocess calls that raise OSError, no issue #

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 4d70436..255e50a 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -36,15 +36,7 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
     def _get_slurm_state(self):
         return subprocess.check_output(['sinfo', '--noheader', '-o', '%t', '-n', self._nodename])
 
-    # The following methods retry on OSError.  This is intended to mitigate bug
-    # #6321 where fork() of node manager raises "OSError: [Errno 12] Cannot
-    # allocate memory" resulting in the untimely death of the shutdown actor
-    # and tends to result in node manager getting into a wedged state where it
-    # won't allocate new nodes or shut down gracefully.  The underlying causes
-    # of the excessive memory usage that result in the "Cannot allocate memory"
-    # error are still being investigated.
-
-    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
+    @RetryMixin._retry((subprocess.CalledProcessError,))
     def cancel_shutdown(self, reason):
         if self._nodename:
             if self._get_slurm_state() in self.SLURM_DRAIN_STATES:
@@ -56,14 +48,14 @@ class ComputeNodeShutdownActor(ShutdownActorBase):
                 pass
         return super(ComputeNodeShutdownActor, self).cancel_shutdown(reason)
 
-    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
+    @RetryMixin._retry((subprocess.CalledProcessError,))
     @ShutdownActorBase._stop_if_window_closed
     def issue_slurm_drain(self):
         self._set_node_state('DRAIN', 'Reason=Node Manager shutdown')
         self._logger.info("Waiting for SLURM node %s to drain", self._nodename)
         self._later.await_slurm_drain()
 
-    @RetryMixin._retry((subprocess.CalledProcessError, OSError))
+    @RetryMixin._retry((subprocess.CalledProcessError,))
     @ShutdownActorBase._stop_if_window_closed
     def await_slurm_drain(self):
         output = self._get_slurm_state()

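The retry decorator used in the diff takes a tuple of exception classes; the
commit simply narrows that tuple so a failed slurm command
(subprocess.CalledProcessError) is still retried while an OSError, e.g. from
fork(), propagates immediately. A minimal sketch of that pattern follows; the
helper name retry_on, its tries/delay parameters, and the standalone
get_slurm_state function are illustrative assumptions, not the actual
arvnodeman RetryMixin implementation.

    # Illustrative sketch only -- not the real RetryMixin._retry from
    # arvnodeman.  It shows a decorator that retries a call when it raises
    # one of a given tuple of exception types, which is the mechanism this
    # commit narrows by dropping OSError from the tuple.
    import functools
    import subprocess
    import time

    def retry_on(exceptions, tries=3, delay=1):
        """Retry the wrapped call when it raises one of `exceptions`."""
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                for attempt in range(1, tries + 1):
                    try:
                        return func(*args, **kwargs)
                    except exceptions:
                        if attempt == tries:
                            raise
                        time.sleep(delay)
            return wrapper
        return decorator

    # After this commit only CalledProcessError (a failing slurm command) is
    # retried; an OSError raised while spawning the subprocess is not.
    @retry_on((subprocess.CalledProcessError,))
    def get_slurm_state(nodename):
        return subprocess.check_output(
            ['sinfo', '--noheader', '-o', '%t', '-n', nodename])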
-----------------------------------------------------------------------


hooks/post-receive
-- 