[ARVADOS] updated: 1.1.2-142-gfa2effb
Git user
git at public.curoverse.com
Fri Feb 2 16:47:54 EST 2018
Summary of changes:
.../arvnodeman/computenode/dispatch/slurm.py | 11 ++++----
services/nodemanager/arvnodeman/nodelist.py | 33 +++++++++++++---------
2 files changed, 24 insertions(+), 20 deletions(-)
via fa2effb40b8fc62e239f2746e9626569e27d9d62 (commit)
from 3ccc70f4bb06bab6c0b3c71f555cba24cc5c6a47 (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit fa2effb40b8fc62e239f2746e9626569e27d9d62
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date: Fri Feb 2 16:46:32 2018 -0500
12199: Track slurm node features, avoid redundant updates.
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>
diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 13a2a6a..76d92e2 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -108,12 +108,11 @@ class ComputeNodeUpdateActor(SlurmMixin, UpdateActorBase):
def sync_node(self, cloud_node, arvados_node):
"""Keep SLURM's node properties up to date."""
hostname = arvados_node.get("hostname")
- if hostname:
- # This is only needed when slurm has restarted and lost
- # the dynamically configured node properties. So it's
- # usually redundant, but detecting when it's necessary
- # would be about the same amount of work as doing it
- # repetitively.
+ features = arvados_node.get("slurm_node_features", "").split(",")
+ sizefeature = "instancetype=" + cloud_node.size
+ if hostname and sizefeature not in features:
+ # This probably means SLURM has restarted and lost our
+ # dynamically configured node weights and features.
self._update_slurm_size_attrs(hostname, cloud_node.size)
return super(ComputeNodeUpdateActor, self).sync_node(
cloud_node, arvados_node)
diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py
index 70ad54d..f56e2e1 100644
--- a/services/nodemanager/arvnodeman/nodelist.py
+++ b/services/nodemanager/arvnodeman/nodelist.py
@@ -15,8 +15,9 @@ import arvados.util
class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
"""Actor to poll the Arvados node list.
- This actor regularly polls the list of Arvados node records, and
- sends it to subscribers.
+ This actor regularly polls the list of Arvados node records,
+ augments it with the latest SLURM node info (`sinfo`), and sends
+ it to subscribers.
"""
def is_common_error(self, exception):
@@ -29,28 +30,32 @@ class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
nodelist = arvados.util.list_all(self._client.nodes().list)
# node hostname, state
- sinfo_out = subprocess.check_output(["sinfo", "--noheader", "--format=%n %t"])
+ sinfo_out = subprocess.check_output(["sinfo", "--noheader", "--format=%n|%t|%f"])
nodestates = {}
+ nodefeatures = {}
for out in sinfo_out.splitlines():
try:
- nodename, state = out.split(" ", 2)
- if state in ('alloc', 'alloc*',
- 'comp', 'comp*',
- 'mix', 'mix*',
- 'drng', 'drng*'):
- nodestates[nodename] = 'busy'
- elif state in ('idle', 'fail'):
- nodestates[nodename] = state
- else:
- nodestates[nodename] = 'down'
+ nodename, state, features = out.split("|", 3)
except ValueError:
- pass
+ continue
+ if state in ('alloc', 'alloc*',
+ 'comp', 'comp*',
+ 'mix', 'mix*',
+ 'drng', 'drng*'):
+ nodestates[nodename] = 'busy'
+ elif state in ('idle', 'fail'):
+ nodestates[nodename] = state
+ else:
+ nodestates[nodename] = 'down'
+ if features != "(null)":
+ nodefeatures[nodename] = features
for n in nodelist:
if n["slot_number"] and n["hostname"] and n["hostname"] in nodestates:
n["crunch_worker_state"] = nodestates[n["hostname"]]
else:
n["crunch_worker_state"] = 'down'
+ n["slurm_node_features"] = nodefeatures.get(n["hostname"])
return nodelist
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list