[ARVADOS] updated: 5e75d51dd186ae59eeb595b7d0f6a93aa4aef271
Git user
git at public.curoverse.com
Mon May 16 16:36:47 EDT 2016
Summary of changes:
services/nodemanager/arvnodeman/computenode/dispatch/__init__.py | 9 +++++++--
services/nodemanager/arvnodeman/daemon.py | 2 --
2 files changed, 7 insertions(+), 4 deletions(-)
via 5e75d51dd186ae59eeb595b7d0f6a93aa4aef271 (commit)
via 5c549965a11b6a2ce789c1e0db9e418f695aed84 (commit)
from a7a16338702965de3ad0687470ef5beb2f42759b (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 5e75d51dd186ae59eeb595b7d0f6a93aa4aef271
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Mon May 16 16:36:41 2016 -0400
9161: Remove spurious prints
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 589b9a1..266b665 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -429,13 +429,11 @@ class NodeManagerDaemonActor(actor_class):
@_check_poll_freshness
def node_can_shutdown(self, node_actor):
if self._nodes_excess(node_actor.cloud_node.get().size) > 0:
- print("excess")
self._begin_node_shutdown(node_actor, cancellable=True)
elif self.cloud_nodes.nodes.get(node_actor.cloud_node.get().id).arvados_node is None:
# Node is unpaired, which means it probably exceeded its booting
# grace period without a ping, so shut it down so we can boot a new
# node in its place.
- print("unpaired")
self._begin_node_shutdown(node_actor, cancellable=False)
elif node_actor.in_state('down').get():
# Node is down and unlikely to come back.
commit 5c549965a11b6a2ce789c1e0db9e418f695aed84
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Mon May 16 14:30:06 2016 -0400
9161: Don't automatically consider nodes with job_uuid set to be 'busy'.
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
index f9dbd20..96b2353 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/__init__.py
@@ -371,8 +371,13 @@ class ComputeNodeMonitorActor(config.actor_class):
if arvados_node_missing(self.arvados_node, self.node_stale_after):
state = 'down'
- if state == 'idle' and self.arvados_node['job_uuid']:
- state = 'busy'
+ # Turns out using 'job_uuid' this way is a bad idea. The node record
+ # is assigned the job_uuid before the job is locked (which removes it
+ # from the queue) which means the job will be double-counted as both in
+ # the wishlist and as keeping a node busy. The end result is
+ # excess nodes being booted.
+ #if state == 'idle' and self.arvados_node['job_uuid']:
+ # state = 'busy'
return state
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list