[ARVADOS] created: 72e3566f2cdacd44f095183ebf88f7aab8b0d8dc

git at public.curoverse.com git at public.curoverse.com
Tue Sep 29 10:25:26 EDT 2015


        at  72e3566f2cdacd44f095183ebf88f7aab8b0d8dc (commit)


commit 72e3566f2cdacd44f095183ebf88f7aab8b0d8dc
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue Sep 29 10:27:10 2015 -0400

    7286: Move logic to shut down newly booted nodes that haven't pinged to
    ComputeNodeMonitorActor.  Shut down nodes if they have "missing" status and are
    "broken" according to the cloud client.  Don't count "missing" nodes as "up"
    when deciding whether to boot more nodes.

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 6d5c223..4557198 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -273,8 +273,10 @@ class ComputeNodeMonitorActor(config.actor_class):
     for shutdown.
     """
     def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
-                 cloud_fqdn_func, timer_actor, update_actor, arvados_node=None,
-                 poll_stale_after=600, node_stale_after=3600):
+                 cloud_fqdn_func, timer_actor, update_actor, cloud_client,
+                 arvados_node=None, poll_stale_after=600, node_stale_after=3600,
+                 boot_fail_after=1800
+    ):
         super(ComputeNodeMonitorActor, self).__init__()
         self._later = self.actor_ref.proxy()
         self._logger = logging.getLogger('arvnodeman.computenode')
@@ -283,10 +285,12 @@ class ComputeNodeMonitorActor(config.actor_class):
         self._cloud_node_fqdn = cloud_fqdn_func
         self._timer = timer_actor
         self._update = update_actor
+        self._cloud = cloud_client
         self.cloud_node = cloud_node
         self.cloud_node_start_time = cloud_node_start_time
         self.poll_stale_after = poll_stale_after
         self.node_stale_after = node_stale_after
+        self.boot_fail_after = boot_fail_after
         self.subscribers = set()
         self.arvados_node = None
         self._later.update_arvados_node(arvados_node)
@@ -322,10 +326,13 @@ class ComputeNodeMonitorActor(config.actor_class):
         if not self._shutdowns.window_open():
             return False
         elif self.arvados_node is None:
-            # If this is a new, unpaired node, it's eligible for
-            # shutdown--we figure there was an error during bootstrap.
-            return timestamp_fresh(self.cloud_node_start_time,
-                                   self.node_stale_after)
+            # Node is unpaired.
+            # If it hasn't pinged Arvados after boot_fail seconds, shut it down
+            return not timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after)
+        elif self.arvados_node.get('status') == "missing" and self._cloud.broken(self.cloud_node):
+            # Node is paired, but Arvados says it is missing and the cloud says the node
+            # is in an error state, so shut it down.
+            return True
         else:
             return self.in_state('idle')
 
diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index 724c772..14e804f 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -103,6 +103,10 @@ class BaseComputeNodeDriver(object):
         """
         raise NotImplementedError("BaseComputeNodeDriver.arvados_create_kwargs")
 
+    def broken(self, cloud_node):
+        """Return true if libcloud has indicated the node is in a "broken" state."""
+        return False
+
     def _make_ping_url(self, arvados_node):
         return 'https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.format(
             self.ping_host, arvados_node['uuid'],
diff --git a/services/nodemanager/arvnodeman/computenode/driver/azure.py b/services/nodemanager/arvnodeman/computenode/driver/azure.py
index b1494d0..ba3c9b0 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/azure.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/azure.py
@@ -81,6 +81,10 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
                 super(ComputeNodeDriver, self).list_nodes()
                 if node.extra["tags"].get("arvados-class") == self.tags["arvados-class"]]
 
+    def broken(self, cloud_node):
+        """Return true if libcloud has indicated the node is in a "broken" state."""
+        return (cloud_node.state in (cloud_types.NodeState.ERROR, cloud_types.NodeState.UNKNOWN))
+
     @classmethod
     def node_fqdn(cls, node):
         return node.extra["tags"].get("hostname")
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 44f1513..ed8c7d5 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -159,7 +159,9 @@ class NodeManagerDaemonActor(actor_class):
             timer_actor=self._timer,
             arvados_node=None,
             poll_stale_after=self.poll_stale_after,
-            node_stale_after=self.node_stale_after).proxy()
+            node_stale_after=self.node_stale_after,
+            cloud_client=self._cloud_driver,
+            boot_fail_after=self.boot_fail_after).proxy()
         actor.subscribe(self._later.node_can_shutdown)
         self._cloud_nodes_actor.subscribe_to(cloud_node.id,
                                              actor.update_cloud_node)
@@ -207,6 +209,12 @@ class NodeManagerDaemonActor(actor_class):
                                  self.cloud_nodes.nodes.itervalues())
                    if busy)
 
+    def _nodes_missing(self):
+        return sum(1 for arv_node in
+                   pykka.get_all(rec.actor.arvados_node for rec in
+                                 self.cloud_nodes.nodes.itervalues())
+                   if arv_node and arv_node.get("status") == "missing")
+
     def _nodes_wanted(self):
         up_count = self._nodes_up()
         under_min = self.min_nodes - up_count
@@ -216,11 +224,11 @@ class NodeManagerDaemonActor(actor_class):
         elif under_min > 0:
             return under_min
         else:
-            up_count -= len(self.shutdowns) + self._nodes_busy()
+            up_count -= len(self.shutdowns) + self._nodes_busy() + self._nodes_missing()
             return len(self.last_wishlist) - up_count
 
     def _nodes_excess(self):
-        up_count = self._nodes_up() - len(self.shutdowns)
+        up_count = self._nodes_up() - len(self.shutdowns) - self._nodes_missing()
         over_min = up_count - self.min_nodes
         if over_min <= 0:
             return over_min

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list