[ARVADOS] created: cf08eb6ac9c236be2ad84bd39a2275b44b525ae5

git at public.curoverse.com git at public.curoverse.com
Wed May 6 16:12:36 EDT 2015


        at  cf08eb6ac9c236be2ad84bd39a2275b44b525ae5 (commit)


commit cf08eb6ac9c236be2ad84bd39a2275b44b525ae5
Author: Brett Smith <brett at curoverse.com>
Date:   Wed May 6 15:59:34 2015 -0400

    5842: Node Manager only considers nodes busy if they're working.
    
    Previously, Node Manager considered any non-idle node as busy,
    including down nodes.  This causes it to boot replacements for nodes
    that are marked "down," even if that's because they're still
    bootstrapping.  Tighten the busy criteria to avoid booting excess
    nodes.
    
    It's easier to make this change now that Node Manager checks that
    nodes have a functional Crunch worker state to be considered a
    successful bootstrap.  This means that any node that's down later is
    in an unexpected state, and we should avoid messing with it.

diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 8552f2a..af8e608 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -202,10 +202,10 @@ class NodeManagerDaemonActor(actor_class):
                    [self.cloud_nodes, self.booted, self.booting])
 
     def _nodes_busy(self):
-        return sum(1 for idle in
-                   pykka.get_all(rec.actor.in_state('idle') for rec in
+        return sum(1 for busy in
+                   pykka.get_all(rec.actor.in_state('busy') for rec in
                                  self.cloud_nodes.nodes.itervalues())
-                   if idle is False)
+                   if busy)
 
     def _nodes_wanted(self):
         up_count = self._nodes_up()
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index dc52ae8..b406f13 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -183,6 +183,19 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.last_setup.arvados_node.get.return_value = arv_node
         return self.last_setup
 
+    def test_no_new_node_when_booted_node_not_usable(self):
+        cloud_node = testutil.cloud_node_mock(4)
+        arv_node = testutil.arvados_node_mock(4, crunch_worker_state='down')
+        setup = self.start_node_boot(cloud_node, arv_node)
+        self.daemon.node_up(setup).get(self.TIMEOUT)
+        self.assertEqual(1, self.alive_monitor_count())
+        self.daemon.update_cloud_nodes([cloud_node])
+        self.daemon.update_arvados_nodes([arv_node])
+        self.daemon.update_server_wishlist(
+            [testutil.MockSize(1)]).get(self.TIMEOUT)
+        self.stop_proxy(self.daemon)
+        self.assertEqual(1, self.node_setup.start.call_count)
+
     def test_no_duplication_when_booting_node_listed_fast(self):
         # Test that we don't start two ComputeNodeMonitorActors when
         # we learn about a booting node through a listing before we

commit d52af99ccc62507d6be159f74f6d200c0129f668
Author: Brett Smith <brett at curoverse.com>
Date:   Wed May 6 16:07:33 2015 -0400

    5842: Node Manager shuts down booted nodes if they can't do compute work.
    
    If a booted node is not able to do work (it's not idle or busy),
    consider that a bootstrapping failure and shut it down, just like a
    failure to pair with an Arvados node.

diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index ba52871..8552f2a 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -327,7 +327,7 @@ class NodeManagerDaemonActor(actor_class):
                 break
         else:
             return None
-        if record.arvados_node is None:
+        if not record.actor.in_state('idle', 'busy').get():
             self._begin_node_shutdown(record.actor, cancellable=False)
 
     def node_finished_shutdown(self, shutdown_actor):
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 228b552..dc52ae8 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -270,6 +270,18 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.stop_proxy(self.daemon)
         self.assertShutdownCancellable(False)
 
+    def test_booted_node_shut_down_when_never_working(self):
+        cloud_node = testutil.cloud_node_mock(4)
+        arv_node = testutil.arvados_node_mock(4, crunch_worker_state='down')
+        setup = self.start_node_boot(cloud_node, arv_node)
+        self.daemon.node_up(setup).get(self.TIMEOUT)
+        self.assertEqual(1, self.alive_monitor_count())
+        self.daemon.update_cloud_nodes([cloud_node])
+        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
+        self.timer.deliver()
+        self.stop_proxy(self.daemon)
+        self.assertShutdownCancellable(False)
+
     def test_node_that_pairs_not_considered_failed_boot(self):
         cloud_node = testutil.cloud_node_mock(3)
         arv_node = testutil.arvados_node_mock(3)
@@ -282,6 +294,18 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.stop_proxy(self.daemon)
         self.assertFalse(self.node_shutdown.start.called)
 
+    def test_node_that_pairs_busy_not_considered_failed_boot(self):
+        cloud_node = testutil.cloud_node_mock(5)
+        arv_node = testutil.arvados_node_mock(5, job_uuid=True)
+        setup = self.start_node_boot(cloud_node, arv_node)
+        self.daemon.node_up(setup).get(self.TIMEOUT)
+        self.assertEqual(1, self.alive_monitor_count())
+        self.daemon.update_cloud_nodes([cloud_node])
+        self.daemon.update_arvados_nodes([arv_node]).get(self.TIMEOUT)
+        self.timer.deliver()
+        self.stop_proxy(self.daemon)
+        self.assertFalse(self.node_shutdown.start.called)
+
     def test_booting_nodes_shut_down(self):
         self.make_daemon(want_sizes=[testutil.MockSize(1)])
         self.daemon.update_server_wishlist([]).get(self.TIMEOUT)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list