[ARVADOS] updated: 0d8d66df56992a39cc032ba482e1ff88de7f22ab
git at public.curoverse.com
git at public.curoverse.com
Wed Apr 15 15:14:53 EDT 2015
Summary of changes:
.../arvnodeman/computenode/dispatch/__init__.py | 10 +++++----
services/nodemanager/arvnodeman/daemon.py | 11 ++++-----
.../nodemanager/tests/test_computenode_dispatch.py | 6 +++--
services/nodemanager/tests/test_daemon.py | 26 ++++++++++++++++++++--
4 files changed, 40 insertions(+), 13 deletions(-)
via 0d8d66df56992a39cc032ba482e1ff88de7f22ab (commit)
via de34089011627304e8e7588def5f6848311a9843 (commit)
via ddd8d6e3452d2c3ff5193a3988c7b6194134d703 (commit)
from 912464ad82bad38f1ce7984b6d4b19734a9816a9 (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit 0d8d66df56992a39cc032ba482e1ff88de7f22ab
Merge: 912464a de34089
Author: Brett Smith <brett at curoverse.com>
Date: Wed Apr 15 15:14:29 2015 -0400
Merge branch '5714-gce-setup-bugfixes-wip'
Closes #5714, #5715.
commit de34089011627304e8e7588def5f6848311a9843
Author: Brett Smith <brett at curoverse.com>
Date: Mon Apr 13 16:48:16 2015 -0400
5714: Avoid Node Manager race conditions around stop_if_no_cloud_node.
Checking .is_alive() seems to always lead to race conditions.
Instead, have CloudNodeSetupActor.stop_if_no_cloud_node() return True
if it's going to stop, else False. Have NodeManagerDaemonActor
respect this return value consistently.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index d0a8b0d..0fab1b0 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -139,8 +139,10 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
self._finished()
def stop_if_no_cloud_node(self):
- if self.cloud_node is None:
- self.stop()
+ if self.cloud_node is not None:
+ return False
+ self.stop()
+ return True
class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 836b673..ba52871 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -299,8 +299,7 @@ class NodeManagerDaemonActor(actor_class):
if (nodes_excess < 1) or not self.booting:
return None
for key, node in self.booting.iteritems():
- node.stop_if_no_cloud_node().get()
- if not node.actor_ref.is_alive():
+ if node.stop_if_no_cloud_node().get():
del self.booting[key]
if nodes_excess > 1:
self._later.stop_booting_node()
@@ -345,12 +344,14 @@ class NodeManagerDaemonActor(actor_class):
def shutdown(self):
self._logger.info("Shutting down after signal.")
self.poll_stale_after = -1 # Inhibit starting/stopping nodes
- for bootnode in self.booting.itervalues():
- bootnode.stop_if_no_cloud_node()
+ setup_stops = {key: node.stop_if_no_cloud_node()
+ for key, node in self.booting.iteritems()}
+ self.booting = {key: self.booting[key]
+ for key in setup_stops if not setup_stops[key].get()}
self._later.await_shutdown()
def await_shutdown(self):
- if any(node.actor_ref.is_alive() for node in self.booting.itervalues()):
+ if self.booting:
self._timer.schedule(time.time() + 1, self._later.await_shutdown)
else:
self.stop()
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index b8cf0ee..96a70c6 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -79,14 +79,16 @@ class ComputeNodeSetupActorTestCase(testutil.ActorTestMixin, unittest.TestCase):
self.make_mocks(
arverror.ApiError(httplib2.Response({'status': '500'}), ""))
self.make_actor()
- self.setup_actor.stop_if_no_cloud_node()
+ self.assertTrue(
+ self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
self.assertTrue(
self.setup_actor.actor_ref.actor_stopped.wait(self.TIMEOUT))
def test_no_stop_when_cloud_node(self):
self.make_actor()
self.wait_for_assignment(self.setup_actor, 'cloud_node')
- self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT)
+ self.assertFalse(
+ self.setup_actor.stop_if_no_cloud_node().get(self.TIMEOUT))
self.assertTrue(self.stop_proxy(self.setup_actor),
"actor was stopped by stop_if_no_cloud_node")
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index dc8fdc3..228b552 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -288,6 +288,24 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
self.stop_proxy(self.daemon)
self.assertTrue(self.last_setup.stop_if_no_cloud_node.called)
+ def test_all_booting_nodes_tried_to_shut_down(self):
+ size = testutil.MockSize(2)
+ self.make_daemon(want_sizes=[size])
+ self.daemon.max_nodes.get(self.TIMEOUT)
+ setup1 = self.last_setup
+ setup1.stop_if_no_cloud_node().get.return_value = False
+ setup1.stop_if_no_cloud_node.reset_mock()
+ self.daemon.update_server_wishlist([size, size]).get(self.TIMEOUT)
+ self.daemon.max_nodes.get(self.TIMEOUT)
+ self.assertIsNot(setup1, self.last_setup)
+ self.last_setup.stop_if_no_cloud_node().get.return_value = True
+ self.last_setup.stop_if_no_cloud_node.reset_mock()
+ self.daemon.update_server_wishlist([]).get(self.TIMEOUT)
+ self.daemon.max_nodes.get(self.TIMEOUT)
+ self.stop_proxy(self.daemon)
+ self.assertEqual(1, self.last_setup.stop_if_no_cloud_node.call_count)
+ self.assertTrue(setup1.stop_if_no_cloud_node.called)
+
def test_shutdown_declined_at_wishlist_capacity(self):
cloud_node = testutil.cloud_node_mock(1)
size = testutil.MockSize(1)
@@ -384,6 +402,8 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
def test_clean_shutdown_waits_for_node_setup_finish(self):
new_node = self.start_node_boot()
+ new_node.stop_if_no_cloud_node().get.return_value = False
+ new_node.stop_if_no_cloud_node.reset_mock()
self.daemon.shutdown().get(self.TIMEOUT)
self.assertTrue(new_node.stop_if_no_cloud_node.called)
self.daemon.node_up(new_node).get(self.TIMEOUT)
@@ -393,9 +413,11 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
self.daemon.actor_ref.actor_stopped.wait(self.TIMEOUT))
def test_wishlist_ignored_after_shutdown(self):
- size = testutil.MockSize(2)
- self.make_daemon(want_sizes=[size])
+ new_node = self.start_node_boot()
+ new_node.stop_if_no_cloud_node().get.return_value = False
+ new_node.stop_if_no_cloud_node.reset_mock()
self.daemon.shutdown().get(self.TIMEOUT)
+ size = testutil.MockSize(2)
self.daemon.update_server_wishlist([size] * 2).get(self.TIMEOUT)
self.timer.deliver()
self.stop_proxy(self.daemon)
commit ddd8d6e3452d2c3ff5193a3988c7b6194134d703
Author: Brett Smith <brett at curoverse.com>
Date: Mon Apr 13 15:37:48 2015 -0400
5714: Node Manager setup process retries Arvados errors.
This fixes a regression from 6ab7cf882cd9a268374b880b5e55b4c8946406b4.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 7081762..d0a8b0d 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -104,12 +104,12 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
else:
self._later.prepare_arvados_node(arvados_node)
- @ComputeNodeStateChangeBase._retry()
+ @ComputeNodeStateChangeBase._retry(config.ARVADOS_ERRORS)
def create_arvados_node(self):
self.arvados_node = self._arvados.nodes().create(body={}).execute()
self._later.create_cloud_node()
- @ComputeNodeStateChangeBase._retry()
+ @ComputeNodeStateChangeBase._retry(config.ARVADOS_ERRORS)
def prepare_arvados_node(self, node):
self.arvados_node = self._arvados.nodes().update(
uuid=node['uuid'],
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list