[ARVADOS] created: ca2421b9d76782171c17abb2583bead4b7b6b821

git at public.curoverse.com git at public.curoverse.com
Fri Oct 2 11:07:32 EDT 2015


        at  ca2421b9d76782171c17abb2583bead4b7b6b821 (commit)


commit ca2421b9d76782171c17abb2583bead4b7b6b821
Author: Brett Smith <brett at curoverse.com>
Date:   Fri Oct 2 11:07:27 2015 -0400

    7435: Node Manager stops trying to shut down delisted cloud nodes.
    
    If the underlying node is gone, trying to destroy it in the cloud will
    almost certainly fail.  It's hard to predict what will happen to
    related actions like draining the node in SLURM.  Just cancel the
    attempt, and trust other systems like SLURM and Crunch to deal with
    the disappearance on their own.

diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 44f1513..6b2e0af 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -180,9 +180,11 @@ class NodeManagerDaemonActor(actor_class):
                     self._pair_nodes(record, arv_rec.arvados_node)
                     break
         for key, record in self.cloud_nodes.orphans.iteritems():
+            if key in self.shutdowns:
+                self.shutdowns[key].stop()
+                del self.shutdowns[key]
             record.actor.stop()
             record.cloud_node = None
-            self.shutdowns.pop(key, None)
 
     def update_arvados_nodes(self, nodelist):
         self._update_poll_time('arvados_nodes')
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index b406f13..2fbdbb6 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -459,3 +459,12 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.timer.deliver()
         self.stop_proxy(self.daemon)
         self.assertEqual(1, self.node_setup.start.call_count)
+
+    def test_shutdown_actor_canceled_when_cloud_node_delisted(self):
+        self.make_daemon(cloud_nodes=[testutil.cloud_node_mock()])
+        self.assertEqual(1, self.alive_monitor_count())
+        monitor = self.monitor_list()[0].proxy()
+        self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
+        self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
+        self.stop_proxy(self.daemon)
+        self.assertEqual(1, self.node_shutdown.start().proxy().stop.call_count)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list