[ARVADOS] updated: 1.1.4-3-gab3afbb68
Git user
git at public.curoverse.com
Mon Jul 16 17:21:03 EDT 2018
Summary of changes:
.../arvnodeman/computenode/dispatch/__init__.py | 21 ++++++--
services/nodemanager/arvnodeman/daemon.py | 62 ++++++++++++++--------
services/nodemanager/tests/integration_test.py | 7 ++-
3 files changed, 63 insertions(+), 27 deletions(-)
via ab3afbb684bc1b32577c2696e13882123bfff7d2 (commit)
via 265fe64e5b7a931736f156c3cb446fbbbc27f018 (commit)
from 83d08d7ccbc622ec97948929c83fb91f96743ca2 (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit ab3afbb684bc1b32577c2696e13882123bfff7d2
Author: Peter Amstutz <pamstutz at veritasgenetics.com>
Date: Mon Jul 16 17:19:53 2018 -0400
13804: Try to cancel pending shutdowns if nodes are needed
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz at veritasgenetics.com>
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index d9b475b90..bb83a193f 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -243,12 +243,15 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
return super(ComputeNodeShutdownActor, self)._finished()
def cancel_shutdown(self, reason, **kwargs):
+ if not self.cancellable:
+ return False
if self.cancel_reason is not None:
# already cancelled
- return
+ return False
self.cancel_reason = reason
self._logger.info("Shutdown cancelled: %s.", reason)
self._finished(success_flag=False)
+ return True
def _cancel_on_exception(orig_func):
@functools.wraps(orig_func)
@@ -282,6 +285,7 @@ class ComputeNodeShutdownActor(ComputeNodeStateChangeBase):
self._logger.info("Starting shutdown")
arv_node = self._arvados_node()
if self._cloud.destroy_node(self.cloud_node):
+ self.cancellable = False
self._logger.info("Shutdown success")
if arv_node:
self._later.clean_arvados_node(arv_node)
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 0d6fdfca9..2e71b316d 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -390,22 +390,25 @@ class NodeManagerDaemonActor(actor_class):
nodes_wanted = self._nodes_wanted(cloud_size)
if nodes_wanted < 1:
return None
- arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
- self._logger.info("Want %i more %s nodes. Booting a node.",
- nodes_wanted, cloud_size.name)
- new_setup = self._node_setup.start(
- timer_actor=self._timer,
- arvados_client=self._new_arvados(),
- arvados_node=arvados_node,
- cloud_client=self._new_cloud(),
- cloud_size=self.server_calculator.find_size(cloud_size.id))
- self.booting[new_setup.actor_urn] = new_setup.proxy()
- self.sizes_booting[new_setup.actor_urn] = cloud_size
-
- if arvados_node is not None:
- self.arvados_nodes[arvados_node['uuid']].assignment_time = (
- time.time())
- new_setup.tell_proxy().subscribe(self._later.node_setup_finished)
+
+ if not self.cancel_node_shutdown(cloud_size):
+ arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
+ self._logger.info("Want %i more %s nodes. Booting a node.",
+ nodes_wanted, cloud_size.name)
+ new_setup = self._node_setup.start(
+ timer_actor=self._timer,
+ arvados_client=self._new_arvados(),
+ arvados_node=arvados_node,
+ cloud_client=self._new_cloud(),
+ cloud_size=self.server_calculator.find_size(cloud_size.id))
+ self.booting[new_setup.actor_urn] = new_setup.proxy()
+ self.sizes_booting[new_setup.actor_urn] = cloud_size
+
+ if arvados_node is not None:
+ self.arvados_nodes[arvados_node['uuid']].assignment_time = (
+ time.time())
+ new_setup.tell_proxy().subscribe(self._later.node_setup_finished)
+
if nodes_wanted > 1:
self._later.start_node(cloud_size)
@@ -456,13 +459,28 @@ class NodeManagerDaemonActor(actor_class):
if (nodes_excess < 1) or not self.booting:
return None
for key, node in self.booting.iteritems():
- if node and node.cloud_size.get().id == size.id and node.stop_if_no_cloud_node().get():
- del self.booting[key]
- del self.sizes_booting[key]
+ try:
+ if node and node.cloud_size.get().id == size.id and node.stop_if_no_cloud_node().get(2):
+ del self.booting[key]
+ del self.sizes_booting[key]
+ if nodes_excess > 1:
+ self._later.stop_booting_node(size)
+ return
+ except pykka.Timeout:
+ pass
- if nodes_excess > 1:
- self._later.stop_booting_node(size)
- break
+ @_check_poll_freshness
+ def cancel_node_shutdown(self, size):
+ # Go through shutdown actors and see if there are any of the appropriate size that can be cancelled
+ for record in self.cloud_nodes.nodes.itervalues():
+ try:
+ if (record.shutdown_actor is not None and
+ record.size.id == size.id and
+ record.shutdown_actor.cancel_shutdown("Node size is in wishlist").get(2)):
+ return True
+ except (pykka.ActorDeadError, pykka.Timeout) as e:
+ pass
+ return False
def _begin_node_shutdown(self, node_actor, cancellable):
cloud_node_obj = node_actor.cloud_node.get()
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
index 508e62663..284b361ce 100755
--- a/services/nodemanager/tests/integration_test.py
+++ b/services/nodemanager/tests/integration_test.py
@@ -124,8 +124,11 @@ def node_busy(g):
def node_shutdown(g):
global compute_nodes
- del compute_nodes[g.group(1)]
- return 0
+ if g.group(1) in compute_nodes:
+ del compute_nodes[g.group(1)]
+ return 0
+ else:
+ return 1
def jobs_req(g):
global all_jobs
commit 265fe64e5b7a931736f156c3cb446fbbbc27f018
Author: Peter Amstutz <pamstutz at veritasgenetics.com>
Date: Mon Jul 16 16:05:02 2018 -0400
13804: Node must report as "idle" two consecutive times
Gives idle nodes 1 poll period to be allocated by slurm before
becoming candidates for shutdown.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz at veritasgenetics.com>
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index 9106ea67c..d9b475b90 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -350,6 +350,7 @@ class ComputeNodeMonitorActor(config.actor_class):
self.boot_fail_after = boot_fail_after
self.subscribers = set()
self.arvados_node = None
+ self.consecutive_idle = 0
self._later.update_arvados_node(arvados_node)
self.last_shutdown_opening = None
self._later.consider_shutdown()
@@ -451,8 +452,14 @@ class ComputeNodeMonitorActor(config.actor_class):
else:
boot_grace = "boot exceeded"
- # API server side not implemented yet.
- idle_grace = 'idle exceeded'
+ if crunch_worker_state == "idle":
+ # Must report as "idle" at least two consecutive times
+ if self.consecutive_idle < 2:
+ idle_grace = 'idle wait'
+ else:
+ idle_grace = 'idle exceeded'
+ else:
+ idle_grace = 'not idle'
node_state = (crunch_worker_state, window, boot_grace, idle_grace)
t = transitions[node_state]
@@ -512,4 +519,8 @@ class ComputeNodeMonitorActor(config.actor_class):
if arvados_node is not None:
self.arvados_node = arvados_node
self._update.sync_node(self.cloud_node, self.arvados_node)
+ if self.arvados_node['crunch_worker_state'] == "idle":
+ self.consecutive_idle += 1
+ else:
+ self.consecutive_idle = 0
self._later.consider_shutdown()
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list