[ARVADOS] created: 1.1.3-222-g9746d8b
Git user
git at public.curoverse.com
Mon Mar 19 10:43:20 EDT 2018
at 9746d8b3d8c259836799656ff6e5f401d9d4d492 (commit)
commit 9746d8b3d8c259836799656ff6e5f401d9d4d492
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date: Mon Mar 19 11:42:30 2018 -0300
13166: Limit wishlist to (max_nodes - up_nodes)
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>
diff --git a/services/nodemanager/arvnodeman/daemon.py b/services/nodemanager/arvnodeman/daemon.py
index 73b58bf..a6e73e2 100644
--- a/services/nodemanager/arvnodeman/daemon.py
+++ b/services/nodemanager/arvnodeman/daemon.py
@@ -348,7 +348,8 @@ class NodeManagerDaemonActor(actor_class):
def update_server_wishlist(self, wishlist):
self._update_poll_time('server_wishlist')
- self.last_wishlist = wishlist
+ requestable_nodes = self.node_quota - (self._nodes_booting(None) + len(self.cloud_nodes))
+ self.last_wishlist = wishlist[:requestable_nodes]
for size in reversed(self.server_calculator.cloud_sizes):
try:
nodes_wanted = self._nodes_wanted(size)
@@ -356,7 +357,7 @@ class NodeManagerDaemonActor(actor_class):
self._later.start_node(size)
elif (nodes_wanted < 0) and self.booting:
self._later.stop_booting_node(size)
- except Exception as e:
+ except Exception:
self._logger.exception("while calculating nodes wanted for size %s", getattr(size, "id", "(id not available)"))
try:
self._update_tracker()
diff --git a/services/nodemanager/tests/test_daemon.py b/services/nodemanager/tests/test_daemon.py
index 50fa0aa..614adbe 100644
--- a/services/nodemanager/tests/test_daemon.py
+++ b/services/nodemanager/tests/test_daemon.py
@@ -700,7 +700,7 @@ class NodeManagerDaemonActorTestCase(testutil.ActorTestMixin,
big = testutil.MockSize(2)
avail_sizes = [(testutil.MockSize(1), {"cores":1}),
(testutil.MockSize(2), {"cores":2})]
- self.make_daemon(want_sizes=[small, small, small, big],
+ self.make_daemon(want_sizes=[small, small, big, small],
avail_sizes=avail_sizes, max_nodes=3)
# the daemon runs in another thread, so we need to wait and see
commit 2dd214985e9727bf0b5fbf11b0e39c3e7d3cc5c4
Author: Lucas Di Pentima <ldipentima at veritasgenetics.com>
Date: Fri Mar 16 17:32:18 2018 -0300
13166: Order slurm queue by priority
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <ldipentima at veritasgenetics.com>
diff --git a/services/nodemanager/arvnodeman/jobqueue.py b/services/nodemanager/arvnodeman/jobqueue.py
index 20849c9..8f3d7b9 100644
--- a/services/nodemanager/arvnodeman/jobqueue.py
+++ b/services/nodemanager/arvnodeman/jobqueue.py
@@ -154,10 +154,10 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
queuelist = []
if self.slurm_queue:
# cpus, memory, tempory disk space, reason, job name, feature constraints
- squeue_out = subprocess.check_output(["squeue", "--state=PENDING", "--noheader", "--format=%c|%m|%d|%r|%j|%f"])
+ squeue_out = subprocess.check_output(["squeue", "--state=PENDING", "--noheader", "--format=%c|%m|%d|%r|%j|%f|%Q"])
for out in squeue_out.splitlines():
try:
- cpu, ram, disk, reason, jobname, features = out.split("|", 5)
+ cpu, ram, disk, reason, jobname, features, priority = out.split("|", 6)
except ValueError:
self._logger.warning("ignored malformed line in squeue output: %r", out)
continue
@@ -177,7 +177,8 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
"uuid": jobname,
"runtime_constraints": {
"instance_type": instance_type,
- }
+ },
+ "priority": int(priority)
})
break
else:
@@ -189,8 +190,10 @@ class JobQueueMonitorActor(clientactor.RemotePollLoopActor):
"min_cores_per_node": cpu,
"min_ram_mb_per_node": self.coerce_to_mb(ram),
"min_scratch_mb_per_node": self.coerce_to_mb(disk)
- }
+ },
+ "priority": int(priority)
})
+ queuelist = sorted(queuelist, key=lambda x: x.get('priority', 1), reverse=True)
if self.jobs_queue:
queuelist.extend(self._client.jobs().queue().execute()['items'])
diff --git a/services/nodemanager/tests/integration_test.py b/services/nodemanager/tests/integration_test.py
index 7b8ba39..f188f03 100755
--- a/services/nodemanager/tests/integration_test.py
+++ b/services/nodemanager/tests/integration_test.py
@@ -58,14 +58,14 @@ def update_script(path, val):
def set_squeue(g):
global all_jobs
update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
- "\n".join("echo '1|100|100|%s|%s|(null)'" % (v, k) for k,v in all_jobs.items()))
+ "\n".join("echo '1|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
return 0
def set_queue_unsatisfiable(g):
global all_jobs, unsatisfiable_job_scancelled
# Simulate a job requesting a 99 core node.
update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n" +
- "\n".join("echo '99|100|100|%s|%s|(null)'" % (v, k) for k,v in all_jobs.items()))
+ "\n".join("echo '99|100|100|%s|%s|(null)|1234567890'" % (v, k) for k,v in all_jobs.items()))
update_script(os.path.join(fake_slurm, "scancel"), "#!/bin/sh\n" +
"\ntouch %s" % unsatisfiable_job_scancelled)
return 0
diff --git a/services/nodemanager/tests/test_jobqueue.py b/services/nodemanager/tests/test_jobqueue.py
index 5223245..8c10f1b 100644
--- a/services/nodemanager/tests/test_jobqueue.py
+++ b/services/nodemanager/tests/test_jobqueue.py
@@ -159,7 +159,7 @@ class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
def test_unsatisfiable_jobs(self, mock_squeue, mock_scancel):
job_uuid = 'zzzzz-8i9sb-zzzzzzzzzzzzzzz'
container_uuid = 'yyyyy-dz642-yyyyyyyyyyyyyyy'
- mock_squeue.return_value = "1|1024|0|(Resources)|" + container_uuid + "|\n"
+ mock_squeue.return_value = "1|1024|0|(Resources)|" + container_uuid + "||1234567890\n"
self.build_monitor([{'items': [{'uuid': job_uuid}]}],
self.MockCalculatorUnsatisfiableJobs(), True, True)
@@ -181,8 +181,8 @@ class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
@mock.patch("subprocess.check_output")
def test_squeue_server_list(self, mock_squeue):
- mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null)
-2|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzz|(null)
+ mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null)|1234567890
+2|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzz|(null)|1234567890
"""
super(JobQueueMonitorActorTestCase, self).build_monitor(jobqueue.ServerCalculator(
@@ -195,8 +195,8 @@ class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
@mock.patch("subprocess.check_output")
def test_squeue_server_list_suffix(self, mock_squeue):
- mock_squeue.return_value = """1|1024M|0|(ReqNodeNotAvail, UnavailableNodes:compute123)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null)
-1|2G|0|(ReqNodeNotAvail)|zzzzz-dz642-zzzzzzzzzzzzzzz|(null)
+ mock_squeue.return_value = """1|1024M|0|(ReqNodeNotAvail, UnavailableNodes:compute123)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null)|1234567890
+1|2G|0|(ReqNodeNotAvail)|zzzzz-dz642-zzzzzzzzzzzzzzz|(null)|1234567890
"""
super(JobQueueMonitorActorTestCase, self).build_monitor(jobqueue.ServerCalculator(
@@ -209,7 +209,7 @@ class JobQueueMonitorActorTestCase(testutil.RemotePollLoopActorTestMixin,
@mock.patch("subprocess.check_output")
def test_squeue_server_list_instancetype_constraint(self, mock_squeue):
- mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|instancetype=z2.test\n"""
+ mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|instancetype=z2.test|1234567890\n"""
super(JobQueueMonitorActorTestCase, self).build_monitor(jobqueue.ServerCalculator(
[(testutil.MockSize(n), {'cores': n, 'ram': n*1024, 'scratch': n}) for n in range(1, 3)]),
True, True)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits mailing list