[ARVADOS] updated: 88b0204247e448ba5f715cb01adba4597bb6aae7

git at public.curoverse.com git at public.curoverse.com
Wed May 6 16:57:48 EDT 2015


Summary of changes:
 services/nodemanager/arvnodeman/daemon.py |  8 +++----
 services/nodemanager/tests/test_daemon.py | 37 +++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 4 deletions(-)

       via  88b0204247e448ba5f715cb01adba4597bb6aae7 (commit)
       via  e81a3e9b46d0c7500079df3959e07074068ceb1b (commit)
       via  42b7aeb02200285fe3390eefa48be28baee3ca58 (commit)
      from  e7e61999bf577b3ba5308c747dda4a0011516aa7 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full, below.


commit 88b0204247e448ba5f715cb01adba4597bb6aae7
Merge: e7e6199 e81a3e9
Author: Brett Smith <brett at curoverse.com>
Date:   Wed May 6 16:57:29 2015 -0400

    Merge branch '5842-node-manager-tighten-successful-boot-criteria-wip'
    
    Closes #5842, #5931.


commit e81a3e9b46d0c7500079df3959e07074068ceb1b
Author: Brett Smith <brett at curoverse.com>
Date:   Wed May 6 15:59:34 2015 -0400

    5842: Node Manager only considers nodes busy if they're working.
    
    Previously, Node Manager considered any non-idle node as busy,
    including down nodes.  This caused it to boot replacements for nodes
    that were marked "down," even if that was because they were still
    bootstrapping.  Tighten the busy criteria to avoid booting excess
    nodes.
    
    It's easier to make this change now that Node Manager checks that
    nodes have a functional Crunch worker state to be considered a
    successful bootstrap.  This means that any node that's down later is
    in an unexpected state, and we should avoid messing with it.

diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 8552f2a..af8e608 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -202,10 +202,10 @@ class NodeManagerDaemonActor(actor_class):
                    [self.cloud_nodes, self.booted, self.booting])
 
     def _nodes_busy(self):
-        return sum(1 for idle in
-                   pykka.get_all(rec.actor.in_state('idle') for rec in
+        return sum(1 for busy in
+                   pykka.get_all(rec.actor.in_state('busy') for rec in
                                  self.cloud_nodes.nodes.itervalues())
-                   if idle is False)
+                   if busy)
 
     def _nodes_wanted(self):
         up_count = self._nodes_up()
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index dc52ae8..b406f13 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -183,6 +183,19 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.last_setup.arvados_node.get.return_value = arv_node
         return self.last_setup
 
+    def test_no_new_node_when_booted_node_not_usable(self):
+        cloud_node = testutil.cloud_node_mock(4)
+        arv_node = testutil.arvados_node_mock(4, crunch_worker_state='down')
+        setup = self.start_node_boot(cloud_node, arv_node)
+        self.daemon.node_up(setup).get(self.TIMEOUT)
+        self.assertEqual(1, self.alive_monitor_count())
+        self.daemon.update_cloud_nodes([cloud_node])
+        self.daemon.update_arvados_nodes([arv_node])
+        self.daemon.update_server_wishlist(
+            [testutil.MockSize(1)]).get(self.TIMEOUT)
+        self.stop_proxy(self.daemon)
+        self.assertEqual(1, self.node_setup.start.call_count)
+
     def test_no_duplication_when_booting_node_listed_fast(self):
         # Test that we don't start two ComputeNodeMonitorActors when
         # we learn about a booting node through a listing before we

commit 42b7aeb02200285fe3390eefa48be28baee3ca58
Author: Brett Smith <brett at curoverse.com>
Date:   Wed May 6 16:07:33 2015 -0400

    5842: Node Manager shuts down booted nodes if they can't do compute work.
    
    If a booted node is not able to do work (it's not idle or busy),
    consider that a bootstrapping failure and shut it down, just like a
    failure to pair with an Arvados node.

diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index ba52871..8552f2a 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -327,7 +327,7 @@ class NodeManagerDaemonActor(actor_class):
                 break
         else:
             return None
-        if record.arvados_node is None:
+        if not record.actor.in_state('idle', 'busy').get():
             self._begin_node_shutdown(record.actor, cancellable=False)
 
     def node_finished_shutdown(self, shutdown_actor):
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 228b552..dc52ae8 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -270,6 +270,18 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.stop_proxy(self.daemon)
         self.assertShutdownCancellable(False)
 
+    def test_booted_node_shut_down_when_never_working(self):
+        cloud_node = testutil.cloud_node_mock(4)
+        arv_node = testutil.arvados_node_mock(4, crunch_worker_state='down')
+        setup = self.start_node_boot(cloud_node, arv_node)
+        self.daemon.node_up(setup).get(self.TIMEOUT)
+        self.assertEqual(1, self.alive_monitor_count())
+        self.daemon.update_cloud_nodes([cloud_node])
+        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
+        self.timer.deliver()
+        self.stop_proxy(self.daemon)
+        self.assertShutdownCancellable(False)
+
     def test_node_that_pairs_not_considered_failed_boot(self):
         cloud_node = testutil.cloud_node_mock(3)
         arv_node = testutil.arvados_node_mock(3)
@@ -282,6 +294,18 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.stop_proxy(self.daemon)
         self.assertFalse(self.node_shutdown.start.called)
 
+    def test_node_that_pairs_busy_not_considered_failed_boot(self):
+        cloud_node = testutil.cloud_node_mock(5)
+        arv_node = testutil.arvados_node_mock(5, job_uuid=True)
+        setup = self.start_node_boot(cloud_node, arv_node)
+        self.daemon.node_up(setup).get(self.TIMEOUT)
+        self.assertEqual(1, self.alive_monitor_count())
+        self.daemon.update_cloud_nodes([cloud_node])
+        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
+        self.timer.deliver()
+        self.stop_proxy(self.daemon)
+        self.assertFalse(self.node_shutdown.start.called)
+
     def test_booting_nodes_shut_down(self):
         self.make_daemon(want_sizes=[testutil.MockSize(1)])
         self.daemon.update_server_wishlist([]).get(self.TIMEOUT)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list