[ARVADOS] updated: c581b66ad0b54d2e57e4c92da9b04af0dbe4ac67
Git user
git at public.curoverse.com
Fri May 13 16:40:36 EDT 2016
Summary of changes:
services/nodemanager/arvnodeman/daemon.py | 49 ++++++++++++++++++++++---------
services/nodemanager/tests/test_daemon.py | 10 +++++++
2 files changed, 45 insertions(+), 14 deletions(-)
via c581b66ad0b54d2e57e4c92da9b04af0dbe4ac67 (commit)
via 99b0c7c39b4941440f0fb8013abcbbdd9ccd12ea (commit)
from caacfc031998dc73cd2f4c767e1a746b7783d379 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit c581b66ad0b54d2e57e4c92da9b04af0dbe4ac67
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri May 13 16:36:02 2016 -0400
9161: Put nodes tagged _nodemanager_recently_booted back into the node list.
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 40ac5dd..3ad2d43 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -214,11 +214,13 @@ class NodeManagerDaemonActor(actor_class):
# A recently booted node is a node that successfully completed the
# setup actor but has not yet appeared in the cloud node list.
# This will have the tag _nodemanager_recently_booted on it, which
- # means we don't want to forget about it yet. Once it appears in
- # the cloud list, the object in record.cloud_node will be replaced
- # by a new one that lacks the "_nodemanager_recently_booted" tag.
- # However if node is being shut down, forget about it.
- if (not hasattr(record.cloud_node, "_nodemanager_recently_booted")) or shutdown:
+ # means (if we're not shutting it down) we want to put it back into
+ # the cloud node list. Once it really appears in the cloud list,
+ # the object in record.cloud_node will be replaced by a new one
+ # that lacks the "_nodemanager_recently_booted" tag.
+ if hasattr(record.cloud_node, "_nodemanager_recently_booted"):
+ self.cloud_nodes.add(record)
+ else:
record.actor.stop()
record.cloud_node = None
@@ -459,11 +461,20 @@ class NodeManagerDaemonActor(actor_class):
shutdown_actor, 'cloud_node', 'success', 'cancel_reason')
cloud_node_id = cloud_node.id
shutdown_actor.stop()
+
if not success:
if cancel_reason == self._node_shutdown.NODE_BROKEN:
self.cloud_nodes.blacklist(cloud_node_id)
del self.shutdowns[cloud_node_id]
del self.sizes_booting_shutdown[cloud_node_id]
+ else:
+ # If the node went from being booted to being shut down without ever
+ # appearing in the cloud node list, it will have the
+ # _nodemanager_recently_booted tag, so get rid of it so that the node
+ # can be forgotten completely.
+ if hasattr(self.cloud_nodes[cloud_node_id].cloud_node, "_nodemanager_recently_booted"):
+ del self.cloud_nodes[cloud_node_id].cloud_node._nodemanager_recently_booted
+
# On success, we want to leave the entry in self.shutdowns so that it
# won't try to shut down the node again. It should disappear from the
# cloud node list, and the entry in self.shutdowns will get cleaned up
commit 99b0c7c39b4941440f0fb8013abcbbdd9ccd12ea
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri May 13 16:09:10 2016 -0400
9161: Add _nodemanager_recently_booted as new way of remembering nodes which are in intermediate state between being created and showing up in the cloud node list.
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 366c1f8..40ac5dd 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -202,15 +202,25 @@ class NodeManagerDaemonActor(actor_class):
self.try_pairing()
for key, record in self.cloud_nodes.orphans.iteritems():
- if key in self.shutdowns:
+ shutdown = key in self.shutdowns
+ if shutdown:
try:
self.shutdowns[key].stop().get()
except pykka.ActorDeadError:
pass
del self.shutdowns[key]
del self.sizes_booting_shutdown[key]
- record.actor.stop()
- record.cloud_node = None
+
+ # A recently booted node is a node that successfully completed the
+ # setup actor but has not yet appeared in the cloud node list.
+ # This will have the tag _nodemanager_recently_booted on it, which
+ # means we don't want to forget about it yet. Once it appears in
+ # the cloud list, the object in record.cloud_node will be replaced
+ # by a new one that lacks the "_nodemanager_recently_booted" tag.
+ # However if node is being shut down, forget about it.
+ if (not hasattr(record.cloud_node, "_nodemanager_recently_booted")) or shutdown:
+ record.actor.stop()
+ record.cloud_node = None
def _register_arvados_node(self, key, arv_node):
self._logger.info("Registering new Arvados node %s", key)
@@ -230,7 +240,6 @@ class NodeManagerDaemonActor(actor_class):
self._pair_nodes(record, arv_rec.arvados_node)
break
-
def _nodes_booting(self, size):
s = sum(1
for c in self.booting.iterkeys()
@@ -242,24 +251,24 @@ class NodeManagerDaemonActor(actor_class):
for c in self.cloud_nodes.unpaired()
if size is None or c.cloud_node.size.id == size.id)
- def _nodes_paired(self, size):
- return sum(1
- for c in self.cloud_nodes.paired()
- if size is None or c.cloud_node.size.id == size.id)
-
def _nodes_down(self, size):
# Make sure to iterate over self.cloud_nodes because what we're
# counting here are compute nodes that are reported by the cloud
# provider but are considered "down" by Arvados.
return sum(1 for down in
pykka.get_all(rec.actor.in_state('down') for rec in
- self.cloud_nodes.nodes.itervalues()
+ self.cloud_nodes.paired()
if ((size is None or rec.cloud_node.size.id == size.id) and
rec.cloud_node.id not in self.shutdowns))
if down)
+ def _nodes_size(self, size):
+ return sum(1
+ for c in self.cloud_nodes.nodes.itervalues()
+ if size is None or c.cloud_node.size.id == size.id)
+
def _nodes_up(self, size):
- up = (self._nodes_booting(size) + self._nodes_unpaired(size) + self._nodes_paired(size)) - (self._nodes_down(size) + self._size_shutdowns(size))
+ up = (self._nodes_booting(size) + self._nodes_size(size)) - (self._nodes_down(size) + self._size_shutdowns(size))
return up
def _total_price(self):
@@ -286,7 +295,7 @@ class NodeManagerDaemonActor(actor_class):
if size is None or self.sizes_booting_shutdown[c].id == size.id)
def _nodes_wanted(self, size):
- total_up_count = self._nodes_up(None) + self._nodes_down(None)
+ total_up_count = self._nodes_booting(None) + self._nodes_size(None)
under_min = self.min_nodes - total_up_count
over_max = total_up_count - self.max_nodes
total_price = self._total_price()
@@ -298,12 +307,12 @@ class NodeManagerDaemonActor(actor_class):
up_count = self._nodes_up(size)
booting_count = self._nodes_booting(size)
+ total_count = self._nodes_size(size)
unpaired_count = self._nodes_unpaired(size)
- paired_count = self._nodes_paired(size)
busy_count = self._nodes_busy(size)
down_count = self._nodes_down(size)
- idle_count = paired_count - (busy_count+down_count)
shutdown_count = self._size_shutdowns(size)
+ idle_count = total_count - (unpaired_count+busy_count+down_count+shutdown_count)
self._logger.info("%s: wishlist %i, up %i (booting %i, unpaired %i, idle %i, busy %i), down %i, shutdown %i", size.name,
self._size_wishlist(size),
@@ -398,6 +407,7 @@ class NodeManagerDaemonActor(actor_class):
# successful and so there isn't anything to do.
if cloud_node is not None:
# Node creation succeeded. Update cloud node list.
+ cloud_node._nodemanager_recently_booted = True
self._register_cloud_node(cloud_node)
del self.booting[setup_proxy.actor_ref.actor_urn]
del self.sizes_booting_shutdown[setup_proxy.actor_ref.actor_urn]
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 73b69d0..b6557d0 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -317,6 +317,16 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
self.daemon.update_cloud_nodes([cloud_node]).get(self.TIMEOUT)
self.assertEqual(1, self.alive_monitor_count())
+ def test_node_counted_after_boot_with_slow_listing(self):
+ # Test that, after we boot a compute node, we assume it exists
+ # even if it doesn't appear in the listing (e.g., because of delays
+ # propagating tags).
+ setup = self.start_node_boot()
+ self.daemon.node_up(setup).get(self.TIMEOUT)
+ self.assertEqual(1, self.alive_monitor_count())
+ self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
+ self.assertEqual(1, self.alive_monitor_count())
+
def test_booted_unlisted_node_counted(self):
setup = self.start_node_boot(id_num=1)
self.daemon.node_up(setup)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list