[ARVADOS] updated: caacfc031998dc73cd2f4c767e1a746b7783d379

Git user git at public.curoverse.com
Fri May 13 14:26:38 EDT 2016


Summary of changes:
 .../arvnodeman/computenode/dispatch/__init__.py    | 17 +++++++--
 services/nodemanager/arvnodeman/daemon.py          | 42 +++++++++-------------
 .../nodemanager/tests/test_computenode_dispatch.py | 25 +++++++------
 services/nodemanager/tests/test_daemon.py          | 19 ++++------
 4 files changed, 50 insertions(+), 53 deletions(-)

       via  caacfc031998dc73cd2f4c767e1a746b7783d379 (commit)
      from  fd75f94f65d11989985bffe8d85fb6da7f75720e (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit caacfc031998dc73cd2f4c767e1a746b7783d379
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Fri May 13 14:26:30 2016 -0400

    9161: Adjusting behavior to accomodate down/broken/missing nodes.

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index b43e8a3..8674f16 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -88,7 +88,7 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
     Manager to handle).
     """
     def __init__(self, timer_actor, arvados_client, cloud_client,
-                 cloud_size, arvados_node,
+                 cloud_size, arvados_node=None,
                  retry_wait=1, max_retry_wait=180):
         super(ComputeNodeSetupActor, self).__init__(
             cloud_client, arvados_client, timer_actor,
@@ -96,7 +96,16 @@ class ComputeNodeSetupActor(ComputeNodeStateChangeBase):
         self.cloud_size = cloud_size
         self.arvados_node = None
         self.cloud_node = None
-        self._later.prepare_arvados_node(arvados_node)
+        if arvados_node is None:
+            self._later.create_arvados_node()
+        else:
+            self._later.prepare_arvados_node(arvados_node)
+
+    @ComputeNodeStateChangeBase._finish_on_exception
+    @RetryMixin._retry(config.ARVADOS_ERRORS)
+    def create_arvados_node(self):
+        self.arvados_node = self._arvados.nodes().create(body={}).execute()
+        self._later.create_cloud_node()
 
     @ComputeNodeStateChangeBase._finish_on_exception
     @RetryMixin._retry(config.ARVADOS_ERRORS)
@@ -350,7 +359,8 @@ class ComputeNodeMonitorActor(config.actor_class):
         if (state == 'down' and
             self.arvados_node['first_ping_at'] and
             timestamp_fresh(self.cloud_node_start_time,
-                            self.boot_fail_after)):
+                            self.boot_fail_after) and
+            not self._cloud.broken(self.cloud_node)):
             state = 'idle'
 
         # "missing" means last_ping_at is stale, this should be
@@ -361,6 +371,7 @@ class ComputeNodeMonitorActor(config.actor_class):
         result = state in states
         if state == 'idle':
             result = result and not self.arvados_node['job_uuid']
+
         return result
 
     def shutdown_eligible(self):
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index c4b0f3b..366c1f8 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -254,11 +254,12 @@ class NodeManagerDaemonActor(actor_class):
         return sum(1 for down in
                    pykka.get_all(rec.actor.in_state('down') for rec in
                                  self.cloud_nodes.nodes.itervalues()
-                                 if size is None or rec.cloud_node.size.id == size.id)
+                                 if ((size is None or rec.cloud_node.size.id == size.id) and
+                                     rec.cloud_node.id not in self.shutdowns))
                    if down)
 
     def _nodes_up(self, size):
-        up = (self._nodes_booting(size) + self._nodes_unpaired(size) + self._nodes_paired(size)) - self._nodes_down(size)
+        up = (self._nodes_booting(size) + self._nodes_unpaired(size) + self._nodes_paired(size)) - (self._nodes_down(size) + self._size_shutdowns(size))
         return up
 
     def _total_price(self):
@@ -280,17 +281,12 @@ class NodeManagerDaemonActor(actor_class):
         return sum(1 for c in self.last_wishlist if c.id == size.id)
 
     def _size_shutdowns(self, size):
-        sh = 0
-        for c in self.shutdowns.iterkeys():
-            try:
-                if self.sizes_booting_shutdown[c].id == size.id:
-                    sh += 1
-            except pykka.ActorDeadError:
-                pass
-        return sh
+        return sum(1
+                  for c in self.shutdowns.iterkeys()
+                  if size is None or self.sizes_booting_shutdown[c].id == size.id)
 
     def _nodes_wanted(self, size):
-        total_up_count = self._nodes_up(None)
+        total_up_count = self._nodes_up(None) + self._nodes_down(None)
         under_min = self.min_nodes - total_up_count
         over_max = total_up_count - self.max_nodes
         total_price = self._total_price()
@@ -370,23 +366,17 @@ class NodeManagerDaemonActor(actor_class):
         nodes_wanted = self._nodes_wanted(cloud_size)
         if nodes_wanted < 1:
             return None
+        arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
         self._logger.info("Want %i more %s nodes.  Booting a node.",
                           nodes_wanted, cloud_size.name)
-
-        arvados_client=self._new_arvados()
-        arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
-        if not arvados_node:
-            arvados_node = arvados_client.nodes().create(body={}).execute()
-            self._register_arvados_node(arvados_node["uuid"], arvados_node)
-
         new_setup = self._node_setup.start(
             timer_actor=self._timer,
-            arvados_client=arvados_client,
+            arvados_client=self._new_arvados(),
             arvados_node=arvados_node,
             cloud_client=self._new_cloud(),
             cloud_size=cloud_size).proxy()
-        self.booting[arvados_node['uuid']] = new_setup
-        self.sizes_booting_shutdown[arvados_node['uuid']] = cloud_size
+        self.booting[new_setup.actor_ref.actor_urn] = new_setup
+        self.sizes_booting_shutdown[new_setup.actor_ref.actor_urn] = cloud_size
 
         if arvados_node is not None:
             self.arvados_nodes[arvados_node['uuid']].assignment_time = (
@@ -409,10 +399,8 @@ class NodeManagerDaemonActor(actor_class):
         if cloud_node is not None:
             # Node creation succeeded.  Update cloud node list.
             self._register_cloud_node(cloud_node)
-            arvuuid = arvados_node["uuid"]
-            if arvuuid in self.booting:
-                del self.booting[arvuuid]
-                del self.sizes_booting_shutdown[arvuuid]
+        del self.booting[setup_proxy.actor_ref.actor_urn]
+        del self.sizes_booting_shutdown[setup_proxy.actor_ref.actor_urn]
 
     @_check_poll_freshness
     def stop_booting_node(self, size):
@@ -444,13 +432,15 @@ class NodeManagerDaemonActor(actor_class):
     @_check_poll_freshness
     def node_can_shutdown(self, node_actor):
         if self._nodes_excess(node_actor.cloud_node.get().size) > 0:
+            print("excess")
             self._begin_node_shutdown(node_actor, cancellable=True)
         elif self.cloud_nodes.nodes.get(node_actor.cloud_node.get().id).arvados_node is None:
             # Node is unpaired, which means it probably exceeded its booting
             # grace period without a ping, so shut it down so we can boot a new
             # node in its place.
+            print("unpaired")
             self._begin_node_shutdown(node_actor, cancellable=False)
-        elif node_actor.in_state('down'):
+        elif node_actor.in_state('down').get():
             # Node is down and unlikely to come back.
             self._begin_node_shutdown(node_actor, cancellable=False)
 
diff --git a/services/nodemanager/tests/test_computenode_dispatch.py b/services/nodemanager/tests/test_computenode_dispatch.py
index 8def853..bf86e69 100644
--- a/services/nodemanager/tests/test_computenode_dispatch.py
+++ b/services/nodemanager/tests/test_computenode_dispatch.py
@@ -356,7 +356,8 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
     def test_shutdown_without_arvados_node(self):
         self.make_actor(start_time=0)
         self.shutdowns._set_state(True, 600)
-        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('unpaired', 'open', 'boot exceeded', 'idle exceeded')"))
+        self.assertEquals((True, "node state is ('unpaired', 'open', 'boot exceeded', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_missing(self):
         arv_node = testutil.arvados_node_mock(10, job_uuid=None,
@@ -364,7 +365,8 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
                                               last_ping_at='1970-01-01T01:02:03.04050607Z')
         self.make_actor(10, arv_node)
         self.shutdowns._set_state(True, 600)
-        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
+        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_running_broken(self):
         arv_node = testutil.arvados_node_mock(12, job_uuid=None,
@@ -372,7 +374,8 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
         self.make_actor(12, arv_node)
         self.shutdowns._set_state(True, 600)
         self.cloud_client.broken.return_value = True
-        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
+        self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_shutdown_missing_broken(self):
         arv_node = testutil.arvados_node_mock(11, job_uuid=None,
@@ -385,27 +388,27 @@ class ComputeNodeMonitorActorTestCase(testutil.ActorTestMixin,
 
     def test_no_shutdown_when_window_closed(self):
         self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
-        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
-                          (False, "node state is ('idle', 'closed', 'boot wait', 'idle exceeded')"))
+        self.assertEquals((False, "node state is ('idle', 'closed', 'boot wait', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_no_shutdown_when_node_running_job(self):
         self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
         self.shutdowns._set_state(True, 600)
-        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
-                          (False, "node state is ('busy', 'open', 'boot wait', 'idle exceeded')"))
+        self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'idle exceeded')"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_no_shutdown_when_node_state_unknown(self):
         self.make_actor(5, testutil.arvados_node_mock(
             5, crunch_worker_state=None))
         self.shutdowns._set_state(True, 600)
-        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
-                          (False, "node is paired but crunch_worker_state is 'None'"))
+        self.assertEquals((False, "node is paired but crunch_worker_state is 'None'"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_no_shutdown_when_node_state_stale(self):
         self.make_actor(6, testutil.arvados_node_mock(6, age=90000))
         self.shutdowns._set_state(True, 600)
-        self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
-                          (False, "node state is stale"))
+        self.assertEquals((False, "node state is stale"),
+                          self.node_actor.shutdown_eligible().get(self.TIMEOUT))
 
     def test_arvados_node_match(self):
         self.make_actor(2)
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 3cc3e78..73b69d0 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -292,8 +292,9 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         setup = self.start_node_boot(cloud_node, arv_node)
         self.daemon.node_up(setup).get(self.TIMEOUT)
         self.assertEqual(1, self.alive_monitor_count())
-        self.daemon.update_cloud_nodes([cloud_node])
         self.daemon.update_arvados_nodes([arv_node])
+        self.daemon.update_cloud_nodes([cloud_node])
+        self.monitor_list()[0].proxy().cloud_node_start_time = time.time()-1801
         self.daemon.update_server_wishlist(
             [testutil.MockSize(1)]).get(self.TIMEOUT)
         self.stop_proxy(self.daemon)
@@ -316,16 +317,6 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         self.daemon.update_cloud_nodes([cloud_node]).get(self.TIMEOUT)
         self.assertEqual(1, self.alive_monitor_count())
 
-    def test_node_counted_after_boot_with_slow_listing(self):
-        # Test that, after we boot a compute node, we assume it exists
-        # even it doesn't appear in the listing (e.g., because of delays
-        # propagating tags).
-        setup = self.start_node_boot()
-        self.daemon.node_up(setup).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-        self.daemon.update_cloud_nodes([]).get(self.TIMEOUT)
-        self.assertEqual(1, self.alive_monitor_count())
-
     def test_booted_unlisted_node_counted(self):
         setup = self.start_node_boot(id_num=1)
         self.daemon.node_up(setup)
@@ -455,8 +446,9 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
 
     def test_shutdown_declined_at_wishlist_capacity(self):
         cloud_node = testutil.cloud_node_mock(1)
+        arv_node = testutil.arvados_node_mock(1)
         size = testutil.MockSize(1)
-        self.make_daemon(cloud_nodes=[cloud_node], want_sizes=[size])
+        self.make_daemon(cloud_nodes=[cloud_node], arvados_nodes=[arv_node], want_sizes=[size])
         self.assertEqual(1, self.alive_monitor_count())
         monitor = self.monitor_list()[0].proxy()
         self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
@@ -465,7 +457,8 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
 
     def test_shutdown_declined_below_min_nodes(self):
         cloud_node = testutil.cloud_node_mock(1)
-        self.make_daemon(cloud_nodes=[cloud_node], min_nodes=1)
+        arv_node = testutil.arvados_node_mock(1)
+        self.make_daemon(cloud_nodes=[cloud_node], arvados_nodes=[arv_node], min_nodes=1)
         self.assertEqual(1, self.alive_monitor_count())
         monitor = self.monitor_list()[0].proxy()
         self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list