[ARVADOS] created: ca2421b9d76782171c17abb2583bead4b7b6b821
git at public.curoverse.com
git at public.curoverse.com
Fri Oct 2 11:07:32 EDT 2015
at ca2421b9d76782171c17abb2583bead4b7b6b821 (commit)
commit ca2421b9d76782171c17abb2583bead4b7b6b821
Author: Brett Smith <brett at curoverse.com>
Date: Fri Oct 2 11:07:27 2015 -0400
7435: Node Manager stops trying to shut down delisted cloud nodes.
If the underlying node is gone, trying to destroy it in the cloud will
almost certainly fail. It's hard to predict what will happen to
related actions like draining the node in SLURM. Just cancel the
attempt, and trust other systems like SLURM and Crunch to deal with
the disappearance on their own.
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 44f1513..6b2e0af 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -180,9 +180,11 @@ class NodeManagerDaemonActor(actor_class):
self._pair_nodes(record, arv_rec.arvados_node)
break
for key, record in self.cloud_nodes.orphans.iteritems():
+ if key in self.shutdowns:
+ self.shutdowns[key].stop()
+ del self.shutdowns[key]
record.actor.stop()
record.cloud_node = None
- self.shutdowns.pop(key, None)
def update_arvados_nodes(self, nodelist):
self._update_poll_time('arvados_nodes')
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index b406f13..2fbdbb6 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -459,3 +459,12 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
self.timer.deliver()
self.stop_proxy(self.daemon)
self.assertEqual(1, self.node_setup.start.call_count)
+
+ def test_shutdown_actor_canceled_when_cloud_node_delisted(self):
+ self.make_daemon(cloud_nodes=[testutil.cloud_node_mock()])
+ self.assertEqual(1, self.alive_monitor_count())
+ monitor = self.monitor_list()[0].proxy()
+ self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
+ self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
+ self.stop_proxy(self.daemon)
+ self.assertEqual(1, self.node_shutdown.start().proxy().stop.call_count)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list