[ARVADOS] updated: 1.1.2-142-gfa2effb

Git user git at public.curoverse.com
Fri Feb 2 16:47:54 EST 2018


Summary of changes:
 .../arvnodeman/computenode/dispatch/slurm.py       | 11 ++++----
 services/nodemanager/arvnodeman/nodelist.py        | 33 +++++++++++++---------
 2 files changed, 24 insertions(+), 20 deletions(-)

       via  fa2effb40b8fc62e239f2746e9626569e27d9d62 (commit)
      from  3ccc70f4bb06bab6c0b3c71f555cba24cc5c6a47 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit fa2effb40b8fc62e239f2746e9626569e27d9d62
Author: Tom Clegg <tclegg at veritasgenetics.com>
Date:   Fri Feb 2 16:46:32 2018 -0500

    12199: Track slurm node features, avoid redundant updates.
    
    Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg at veritasgenetics.com>

diff --git a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
index 13a2a6a..76d92e2 100644
--- a/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
+++ b/services/nodemanager/arvnodeman/computenode/dispatch/slurm.py
@@ -108,12 +108,11 @@ class ComputeNodeUpdateActor(SlurmMixin, UpdateActorBase):
     def sync_node(self, cloud_node, arvados_node):
         """Keep SLURM's node properties up to date."""
         hostname = arvados_node.get("hostname")
-        if hostname:
-            # This is only needed when slurm has restarted and lost
-            # the dynamically configured node properties. So it's
-            # usually redundant, but detecting when it's necessary
-            # would be about the same amount of work as doing it
-            # repetitively.
+        features = arvados_node.get("slurm_node_features", "").split(",")
+        sizefeature = "instancetype=" + cloud_node.size
+        if hostname and sizefeature not in features:
+            # This probably means SLURM has restarted and lost our
+            # dynamically configured node weights and features.
             self._update_slurm_size_attrs(hostname, cloud_node.size)
         return super(ComputeNodeUpdateActor, self).sync_node(
             cloud_node, arvados_node)
diff --git a/services/nodemanager/arvnodeman/nodelist.py b/services/nodemanager/arvnodeman/nodelist.py
index 70ad54d..f56e2e1 100644
--- a/services/nodemanager/arvnodeman/nodelist.py
+++ b/services/nodemanager/arvnodeman/nodelist.py
@@ -15,8 +15,9 @@ import arvados.util
 class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
     """Actor to poll the Arvados node list.
 
-    This actor regularly polls the list of Arvados node records, and
-    sends it to subscribers.
+    This actor regularly polls the list of Arvados node records,
+    augments it with the latest SLURM node info (`sinfo`), and sends
+    it to subscribers.
     """
 
     def is_common_error(self, exception):
@@ -29,28 +30,32 @@ class ArvadosNodeListMonitorActor(clientactor.RemotePollLoopActor):
         nodelist = arvados.util.list_all(self._client.nodes().list)
 
         # node hostname, state
-        sinfo_out = subprocess.check_output(["sinfo", "--noheader", "--format=%n %t"])
+        sinfo_out = subprocess.check_output(["sinfo", "--noheader", "--format=%n|%t|%f"])
         nodestates = {}
+        nodefeatures = {}
         for out in sinfo_out.splitlines():
             try:
-                nodename, state = out.split(" ", 2)
-                if state in ('alloc', 'alloc*',
-                             'comp',  'comp*',
-                             'mix',   'mix*',
-                             'drng',  'drng*'):
-                    nodestates[nodename] = 'busy'
-                elif state in ('idle', 'fail'):
-                    nodestates[nodename] = state
-                else:
-                    nodestates[nodename] = 'down'
+                nodename, state, features = out.split("|", 3)
             except ValueError:
-                pass
+                continue
+            if state in ('alloc', 'alloc*',
+                         'comp',  'comp*',
+                         'mix',   'mix*',
+                         'drng',  'drng*'):
+                nodestates[nodename] = 'busy'
+            elif state in ('idle', 'fail'):
+                nodestates[nodename] = state
+            else:
+                nodestates[nodename] = 'down'
+            if features != "(null)":
+                nodefeatures[nodename] = features
 
         for n in nodelist:
             if n["slot_number"] and n["hostname"] and n["hostname"] in nodestates:
                 n["crunch_worker_state"] = nodestates[n["hostname"]]
             else:
                 n["crunch_worker_state"] = 'down'
+            n["slurm_node_features"] = nodefeatures.get(n["hostname"])
 
         return nodelist
 

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list