[ARVADOS] created: 72e3566f2cdacd44f095183ebf88f7aab8b0d8dc
git at public.curoverse.com
git at public.curoverse.com
Tue Sep 29 10:25:26 EDT 2015
at 72e3566f2cdacd44f095183ebf88f7aab8b0d8dc (commit)
commit 72e3566f2cdacd44f095183ebf88f7aab8b0d8dc
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue Sep 29 10:27:10 2015 -0400
7286: Move logic to shut down newly booted nodes that haven't pinged to
ComputeNodeMonitorActor. Shut down nodes if they have "missing" status and are
"broken" according to the cloud client. Don't count "missing" nodes as "up"
when deciding whether to boot more nodes.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 6d5c223..4557198 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -273,8 +273,10 @@ class ComputeNodeMonitorActor(config.actor_class):
for shutdown.
"""
def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
- cloud_fqdn_func, timer_actor, update_actor, arvados_node=None,
- poll_stale_after=600, node_stale_after=3600):
+ cloud_fqdn_func, timer_actor, update_actor, cloud_client,
+ arvados_node=None, poll_stale_after=600, node_stale_after=3600,
+ boot_fail_after=1800
+ ):
super(ComputeNodeMonitorActor, self).__init__()
self._later = self.actor_ref.proxy()
self._logger = logging.getLogger('arvnodeman.computenode')
@@ -283,10 +285,12 @@ class ComputeNodeMonitorActor(config.actor_class):
self._cloud_node_fqdn = cloud_fqdn_func
self._timer = timer_actor
self._update = update_actor
+ self._cloud = cloud_client
self.cloud_node = cloud_node
self.cloud_node_start_time = cloud_node_start_time
self.poll_stale_after = poll_stale_after
self.node_stale_after = node_stale_after
+ self.boot_fail_after = boot_fail_after
self.subscribers = set()
self.arvados_node = None
self._later.update_arvados_node(arvados_node)
@@ -322,10 +326,13 @@ class ComputeNodeMonitorActor(config.actor_class):
if not self._shutdowns.window_open():
return False
elif self.arvados_node is None:
- # If this is a new, unpaired node, it's eligible for
- # shutdown--we figure there was an error during bootstrap.
- return timestamp_fresh(self.cloud_node_start_time,
- self.node_stale_after)
+ # Node is unpaired.
+ # If it hasn't pinged Arvados after boot_fail seconds, shut it down
+ return not timestamp_fresh(self.cloud_node_start_time, self.boot_fail_after)
+ elif self.arvados_node.get('status') == "missing" and self._cloud.broken(self.cloud_node):
+ # Node is paired, but Arvados says it is missing and the cloud says the node
+ # is in an error state, so shut it down.
+ return True
else:
return self.in_state('idle')
diff --git a/services/nodemanager/arvnodeman/computenode/driver/__init__.py b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
index 724c772..14e804f 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/__init__.py
@@ -103,6 +103,10 @@ class BaseComputeNodeDriver(object):
"""
raise NotImplementedError("BaseComputeNodeDriver.arvados_create_kwargs")
+ def broken(self, cloud_node):
+ """Return true if libcloud has indicated the node is in a "broken" state."""
+ return False
+
def _make_ping_url(self, arvados_node):
return 'https://{}/arvados/v1/nodes/{}/ping?ping_secret={}'.format(
self.ping_host, arvados_node['uuid'],
diff --git a/services/nodemanager/arvnodeman/computenode/driver/azure.py b/services/nodemanager/arvnodeman/computenode/driver/azure.py
index b1494d0..ba3c9b0 100644
--- a/services/nodemanager/arvnodeman/computenode/driver/azure.py
+++ b/services/nodemanager/arvnodeman/computenode/driver/azure.py
@@ -81,6 +81,10 @@ class ComputeNodeDriver(BaseComputeNodeDriver):
super(ComputeNodeDriver, self).list_nodes()
if node.extra["tags"].get("arvados-class") == self.tags["arvados-class"]]
+ def broken(self, cloud_node):
+ """Return true if libcloud has indicated the node is in a "broken" state."""
+ return (cloud_node.state in (cloud_types.NodeState.ERROR, cloud_types.NodeState.UNKNOWN))
+
@classmethod
def node_fqdn(cls, node):
return node.extra["tags"].get("hostname")
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 44f1513..ed8c7d5 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -159,7 +159,9 @@ class NodeManagerDaemonActor(actor_class):
timer_actor=self._timer,
arvados_node=None,
poll_stale_after=self.poll_stale_after,
- node_stale_after=self.node_stale_after).proxy()
+ node_stale_after=self.node_stale_after,
+ cloud_client=self._cloud_driver,
+ boot_fail_after=self.boot_fail_after).proxy()
actor.subscribe(self._later.node_can_shutdown)
self._cloud_nodes_actor.subscribe_to(cloud_node.id,
actor.update_cloud_node)
@@ -207,6 +209,12 @@ class NodeManagerDaemonActor(actor_class):
self.cloud_nodes.nodes.itervalues())
if busy)
+ def _nodes_missing(self):
+ return sum(1 for arv_node in
+ pykka.get_all(rec.actor.arvados_node for rec in
+ self.cloud_nodes.nodes.itervalues())
+ if arv_node and arv_node.get("status") == "missing")
+
def _nodes_wanted(self):
up_count = self._nodes_up()
under_min = self.min_nodes - up_count
@@ -216,11 +224,11 @@ class NodeManagerDaemonActor(actor_class):
elif under_min > 0:
return under_min
else:
- up_count -= len(self.shutdowns) + self._nodes_busy()
+ up_count -= len(self.shutdowns) + self._nodes_busy() + self._nodes_missing()
return len(self.last_wishlist) - up_count
def _nodes_excess(self):
- up_count = self._nodes_up() - len(self.shutdowns)
+ up_count = self._nodes_up() - len(self.shutdowns) - self._nodes_missing()
over_min = up_count - self.min_nodes
if over_min <= 0:
return over_min
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list