[ARVADOS] created: 1.1.3-222-g9746d8b

Git user git at public.curoverse.com
Mon Mar 19 10:43:20 EDT 2018


        at  9746d8b3d8c259836799656ff6e5f401d9d4d492 (commit)


commit 9746d8b3d8c259836799656ff6e5f401d9d4d492
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date:   Mon Mar 19 11:42:30 2018 -0300

    13166: Limit wishlist to (max_nodes - up_nodes)
    
    Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>

diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 73b58bf..a6e73e2 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -348,7 +348,8 @@ class NodeManagerDaemonActor(actor_class):
 
     def update_server_wishlist(self, wishlist):
         self._update_poll_time('server_wishlist')
-        self.last_wishlist = wishlist
+        requestable_nodes = self.node_quota - (self._nodes_booting(None) + len(self.cloud_nodes))
+        self.last_wishlist = wishlist[:requestable_nodes]
         for size in reversed(self.server_calculator.cloud_sizes):
             try:
                 nodes_wanted = self._nodes_wanted(size)
@@ -356,7 +357,7 @@ class NodeManagerDaemonActor(actor_class):
                     self._later.start_node(size)
                 elif (nodes_wanted < 0) and self.booting:
                     self._later.stop_booting_node(size)
-            except Exception as e:
+            except Exception:
                 self._logger.exception("while calculating nodes wanted for size %s", getattr(size, "id", "(id not available)"))
         try:
             self._update_tracker()
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 50fa0aa..614adbe 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -700,7 +700,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
         big = testutil.MockSize(2)
         avail_sizes = [(testutil.MockSize(1), {"cores":1}),
                         (testutil.MockSize(2), {"cores":2})]
-        self.make_daemon(want_sizes=[small, small, small, big],
+        self.make_daemon(want_sizes=[small, small, big, small],
                          avail_sizes=avail_sizes, max_nodes=3)
 
         # the daemon runs in another thread, so we need to wait and see

commit 2dd214985e9727bf0b5fbf11b0e39c3e7d3cc5c4
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date:   Fri Mar 16 17:32:18 2018 -0300

    13166: Order slurm queue by priority
    
    Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>

diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index 20849c9..8f3d7b9 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -154,10 +154,10 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
         queuelist = []
         if self.slurm_queue:
             # cpus, memory, tempory disk space, reason, job name, feature constraints
-            squeue_out = subprocess.check_output(["squeue", "--state=PENDING", "--noheader", "--format=%c|%m|%d|%r|%j|%f"])
+            squeue_out = subprocess.check_output(["squeue", "--state=PENDING", "--noheader", "--format=%c|%m|%d|%r|%j|%f|%Q"])
             for out in squeue_out.splitlines():
                 try:
-                    cpu, ram, disk, reason, jobname, features = out.split("|", 5)
+                    cpu, ram, disk, reason, jobname, features, priority = out.split("|", 6)
                 except ValueError:
                     self._logger.warning("ignored malformed line in squeue output: %r", out)
                     continue
@@ -177,7 +177,8 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
                         "uuid": jobname,
                         "runtime_constraints": {
                             "instance_type": instance_type,
-                        }
+                        },
+                        "priority": int(priority)
                     })
                     break
                 else:
@@ -189,8 +190,10 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
                             "min_cores_per_node": cpu,
                             "min_ram_mb_per_node": self.coerce_to_mb(ram),
                             "min_scratch_mb_per_node": self.coerce_to_mb(disk)
-                        }
+                        },
+                        "priority": int(priority)
                     })
+            queuelist = sorted(queuelist, key=lambda x: x.get('priority', 1), reverse=True)
 
         if self.jobs_queue:
             queuelist.extend(self._client.jobs().queue().execute()['items'])
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
index 7b8ba39..f188f03 100755
--- a/services/nodemanager/tests/integration_test.py
+++ b/services/nodemanager/tests/integration_test.py
@@ -58,14 +58,14 @@ def update_script(path, val):
 def set_squeue(g):
     global all_jobs
     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
-                  "\n".join("echo '1|100|100|%s|%s|(null)'" % (v, k) for k,v in all_jobs.items()))
+                  "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
     return 0
 
 def set_queue_unsatisfiable(g):
     global all_jobs, unsatisfiable_job_scancelled
     # Simulate a job requesting a 99 core node.
     update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
-                  "\n".join("echo '99|100|100|%s|%s|(null)'" % (v, k) for k,v in all_jobs.items()))
+                  "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
     update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
                   "\ntouch %s" % unsatisfiable_job_scancelled)
     return 0
diff --git a/services/nodemanager/tests/test_jobqueue.py b/services/nodemanager/tests/test_jobqueue.py
index 5223245..8c10f1b 100644
--- a/services/nodemanager/tests/test_jobqueue.py
+++ b/services/nodemanager/tests/test_jobqueue.py
@@ -159,7 +159,7 @@ class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
     def test_unsatisfiable_jobs(self, mock_squeue, mock_scancel):
         job_uuid = 'zzzzz-8i9sb-zzzzzzzzzzzzzzz'
         container_uuid = 'yyyyy-dz642-yyyyyyyyyyyyyyy'
-        mock_squeue.return_value = "1|1024|0|(Resources)|" + container_uuid + "|\n"
+        mock_squeue.return_value = "1|1024|0|(Resources)|" + container_uuid + "||1234567890\n"
 
         self.build_monitor([{'items': [{'uuid': job_uuid}]}],
                            self.MockCalculatorUnsatisfiableJobs(), True, True)
@@ -181,8 +181,8 @@ class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
 
     @mock.patch("subprocess.check_output")
     def test_squeue_server_list(self, mock_squeue):
-        mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null)
-2|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzz|(null)
+        mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null)|1234567890
+2|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzz|(null)|1234567890
 """
 
         super(JobQueueMonitorActorTestCase, self).build_monitor(jobqueue.ServerCalculator(
@@ -195,8 +195,8 @@ class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
 
     @mock.patch("subprocess.check_output")
     def test_squeue_server_list_suffix(self, mock_squeue):
-        mock_squeue.return_value = """1|1024M|0|(ReqNodeNotAvail, UnavailableNodes:compute123)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null)
-1|2G|0|(ReqNodeNotAvail)|zzzzz-dz642-zzzzzzzzzzzzzzz|(null)
+        mock_squeue.return_value = """1|1024M|0|(ReqNodeNotAvail, UnavailableNodes:compute123)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null)|1234567890
+1|2G|0|(ReqNodeNotAvail)|zzzzz-dz642-zzzzzzzzzzzzzzz|(null)|1234567890
 """
 
         super(JobQueueMonitorActorTestCase, self).build_monitor(jobqueue.ServerCalculator(
@@ -209,7 +209,7 @@ class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
 
     @mock.patch("subprocess.check_output")
     def test_squeue_server_list_instancetype_constraint(self, mock_squeue):
-        mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|instancetype=z2.test\n"""
+        mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|instancetype=z2.test|1234567890\n"""
         super(JobQueueMonitorActorTestCase, self).build_monitor(jobqueue.ServerCalculator(
             [(testutil.MockSize(n), {'cores': n, 'ram': n*1024, 'scratch': n}) for n in range(1, 3)]),
                                                                 True, True)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list