[ARVADOS] updated: c581b66ad0b54d2e57e4c92da9b04af0dbe4ac67
Git user
git at public.curoverse.com
Fri May 13 16:40:36 EDT 2016
Summary of changes:
services/nodemanager/arvnodeman/daemon.py | 49 ++++++++++++++++++++++---------
services/nodemanager/tests/test_daemon.py | 10 +++++++
2 files changed, 45 insertions(+), 14 deletions(-)
via c581b66ad0b54d2e57e4c92da9b04af0dbe4ac67 (commit)
via 99b0c7c39b4941440f0fb8013abcbbdd9ccd12ea (commit)
from caacfc031998dc73cd2f4c767e1a746b7783d379 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit c581b66ad0b54d2e57e4c92da9b04af0dbe4ac67
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri May 13 16:36:02 2016 -0400
9161: Put nodes tagged _nodemanager_recently_booted back into the node list.
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 40ac5dd..3ad2d43 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -214,11 +214,13 @@ class NodeManagerDaemonActor(actor_class):
# A recently booted node is a node that successfully completed the
# setup actor but has not yet appeared in the cloud node list.
# This will have the tag _nodemanager_recently_booted on it, which
- # means we don't want to forget about it yet. Once it appears in
- # the cloud list, the object in record.cloud_node will be replaced
- # by a new one that lacks the "_nodemanager_recently_booted" tag.
- # However if node is being shut down, forget about it.
- if (not hasattr(record.cloud_node, "_nodemanager_recently_booted")) or shutdown:
+ # means (if we're not shutting it down) we want to put it back into
+ # the cloud node list. Once it really appears in the cloud list,
+ # the object in record.cloud_node will be replaced by a new one
+ # that lacks the "_nodemanager_recently_booted" tag.
+ if hasattr(record.cloud_node, "_nodemanager_recently_booted"):
+ self.cloud_nodes.add(record)
+ else:
record.actor.stop()
record.cloud_node = None
@@ -459,11 +461,20 @@ class NodeManagerDaemonActor(actor_class):
shutdown_actor, 'cloud_node', 'success', 'cancel_reason')
cloud_node_id = cloud_node.id
shutdown_actor.stop()
+
if not success:
if cancel_reason == self._node_shutdown.NODE_BROKEN:
self.cloud_nodes.blacklist(cloud_node_id)
del self.shutdowns[cloud_node_id]
del self.sizes_booting_shutdown[cloud_node_id]
+ else:
+ # If the node went from being booted to being shut down without ever
+ # appearing in the cloud node list, it will have the
+ # _nodemanager_recently_booted tag, so get rid of it so that the node
+ # can be forgotten completely.
+ if hasattr(self.cloud_nodes[cloud_node_id].cloud_node, "_nodemanager_recently_booted"):
+ del self.cloud_nodes[cloud_node_id].cloud_node._nodemanager_recently_booted
+
# On success, we want to leave the entry in self.shutdowns so that it
# won't try to shut down the node again. It should disappear from the
# cloud node list, and the entry in self.shutdowns will get cleaned up
commit 99b0c7c39b4941440f0fb8013abcbbdd9ccd12ea
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Fri May 13 16:09:10 2016 -0400
9161: Add _nodemanager_recently_booted as new way of remembering nodes which are in intermediate state between being created and showing up in the cloud node list.
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 366c1f8..40ac5dd 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -202,15 +202,25 @@ class NodeManagerDaemonActor(actor_class):
self.try_pairing()
for key, record in self.cloud_nodes.orphans.iteritems():
- if key in self.shutdowns:
+ shutdown = key in self.shutdowns
+ if shutdown:
try:
self.shutdowns[key].stop().get()
except pykka.ActorDeadError:
pass
del self.shutdowns[key]
del self.sizes_booting_shutdown[key]
- record.actor.stop()
- record.cloud_node = None
+
+ # A recently booted node is a node that successfully completed the
+ # setup actor but has not yet appeared in the cloud node list.
+ # This will have the tag _nodemanager_recently_booted on it, which
+ # means we don't want to forget about it yet. Once it appears in
+ # the cloud list, the object in record.cloud_node will be replaced
+ # by a new one that lacks the "_nodemanager_recently_booted" tag.
+ # However if node is being shut down, forget about it.
+ if (not hasattr(record.cloud_node, "_nodemanager_recently_booted")) or shutdown:
+ record.actor.stop()
+ record.cloud_node = None
def _register_arvados_node(self, key, arv_node):
self._logger.info("Registering new Arvados node %s", key)
@@ -230,7 +240,6 @@ class NodeManagerDaemonActor(actor_class):
self._pair_nodes(record, arv_rec.arvados_node)
break
-
def _nodes_booting(self, size):
s = sum(1
for c in self.booting.iterkeys()
@@ -242,24 +251,24 @@ class NodeManagerDaemonActor(actor_class):
for c in self.cloud_nodes.unpaired()
if size is None or c.cloud_node.size.id == size.id)
- def _nodes_paired(self, size):
- return sum(1
- for c in self.cloud_nodes.paired()
- if size is None or c.cloud_node.size.id == size.id)
-
def _nodes_down(self, size):
# Make sure to iterate over self.cloud_nodes because what we're
# counting here are compute nodes that are reported by the cloud
# provider but are considered "down" by Arvados.
return sum(1 for down in
pykka.get_all(rec.actor.in_state('down') for rec in
- self.cloud_nodes.nodes.itervalues()
+ self.cloud_nodes.paired()
if ((size is None or rec.cloud_node.size.id == size.id) and
rec.cloud_node.id not in self.shutdowns))
if down)
+ def _nodes_size(self, size):
+ return sum(1
+ for c in self.cloud_nodes.nodes.itervalues()
+ if size is None or c.cloud_node.size.id == size.id)
+
def _nodes_up(self, size):
- up = (self._nodes_booting(size) + self._nodes_unpaired(size) + self._nodes_paired(size)) - (self._nodes_down(size) + self._size_shutdowns(size))
+ up = (self._nodes_booting(size) + self._nodes_size(size)) - (self._nodes_down(size) + self._size_shutdowns(size))
return up
def _total_price(self):
@@ -286,7 +295,7 @@ class NodeManagerDaemonActor(actor_class):
if size is None or self.sizes_booting_shutdown[c].id == size.id)
def _nodes_wanted(self, size):
- total_up_count = self._nodes_up(None) + self._nodes_down(None)
+ total_up_count = self._nodes_booting(None) + self._nodes_size(None)
under_min = self.min_nodes - total_up_count
over_max = total_up_count - self.max_nodes
total_price = self._total_price()
@@ -298,12 +307,12 @@ class NodeManagerDaemonActor(actor_class):
up_count = self._nodes_up(size)
booting_count = self._nodes_booting(size)
+ total_count = self._nodes_size(size)
unpaired_count = self._nodes_unpaired(size)
- paired_count = self._nodes_paired(size)
busy_count = self._nodes_busy(size)
down_count = self._nodes_down(size)
- idle_count = paired_count - (busy_count+down_count)
shutdown_count = self._size_shutdowns(size)
+ idle_count = total_count - (unpaired_count+busy_count+down_count+shutdown_count)
self._logger.info("%s: wishlist %i, up %i (booting %i, unpaired %i, idle %i, busy %i), down %i, shutdown %i", size.name,
self._size_wishlist(size),
@@ -398,6 +407,7 @@ class NodeManagerDaemonActor(actor_class):
# successful and so there isn't anything to do.
if cloud_node is not None:
# Node creation succeeded. Update cloud node list.
+ cloud_node._nodemanager_recently_booted = True
self._register_cloud_node(cloud_node)
del self.booting[setup_proxy.actor_ref.actor_urn]
del self.sizes_booting_shutdown[setup_proxy.actor_ref.actor_urn]
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 73b69d0..b6557d0 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -317,6 +317,16 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
self.daemon.update_cloud_nodes([cloud_node]).get(self.TIMEOUT)
self.assertEqual(1, self.alive_monitor_count())
+ def test_node_counted_after_boot_with_slow_listing(self):
+ # Test that, after we boot a compute node, we assume it exists
+ # even if it doesn't appear in the listing (e.g., because of delays
+ # propagating tags).
+ setup = self.start_node_boot()
+ self.daemon.node_up(setup).get(self.TIMEOUT)
+ self.assertEqual(1, self.alive_monitor_count())
+ self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
+ self.assertEqual(1, self.alive_monitor_count())
+
def test_booted_unlisted_node_counted(self):
setup = self.start_node_boot(id_num=1)
self.daemon.node_up(setup)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list