[ARVADOS] updated: 82c4697bf24b10f3fb66d303ae73499095b5742a
git at public.curoverse.com
git at public.curoverse.com
Tue Jun 10 14:13:10 EDT 2014
Summary of changes:
services/api/script/crunch-dispatch.rb | 97 +++++++++++++++++++++++++++-------
1 file changed, 78 insertions(+), 19 deletions(-)
via 82c4697bf24b10f3fb66d303ae73499095b5742a (commit)
via 505f5c37bb9fe1fe93f8bdbd2d2072e783832f20 (commit)
via 541ce54ba2c7c8f9783da04f947ccf055b72ae2c (commit)
via 35db7f5c7f1d62f996550f51fc4f0dd4f77627fb (commit)
from 7e865395a4ccf9f17b904c3700064328d52db121 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 82c4697bf24b10f3fb66d303ae73499095b5742a
Merge: 7e86539 505f5c3
Author: Brett Smith <brett at curoverse.com>
Date: Tue Jun 10 14:12:31 2014 -0400
Merge branch '2880-crunch-dispatch-node-constraints'
Closes #2880, #2976, #2993.
commit 505f5c37bb9fe1fe93f8bdbd2d2072e783832f20
Author: Brett Smith <brett at curoverse.com>
Date: Fri Jun 6 13:18:38 2014 -0400
2880: Don't dispatch Jobs until runtime constraints are met.
This retains the same FIFO approach to the Job queue that
crunch-dispatch currently uses, but now when it encounters a Job whose
constraints are not met:
* it may wait for a while to see if the Node Manager makes Nodes
available, if it hasn't done that this hour; and
* it leaves that Job in the queue and tries to process the next one.
See #2880 for further background. The exact parameters of "waiting
for Nodes" will probably need tuning, but that will be easier to do
after it's been in production for a while.
diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index 11d5540..59e3aff 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -65,7 +65,6 @@ class Dispatcher
def update_node_status
if Server::Application.config.crunch_job_wrapper.to_s.match /^slurm/
- @nodes_in_state = {idle: 0, alloc: 0, down: 0}
@node_state ||= {}
node_seen = {}
begin
@@ -78,9 +77,6 @@ class Dispatcher
next if node_seen[re[1]]
node_seen[re[1]] = true
- # count nodes in each state
- @nodes_in_state[re[2].to_sym] += 1
-
# update our database (and cache) when a node's state changes
if @node_state[re[1]] != re[2]
@node_state[re[1]] = re[2]
@@ -102,40 +98,99 @@ class Dispatcher
end
end
- def start_jobs
- @todo.each do |job|
+ def positive_int(raw_value, default=nil)
+ value = begin raw_value.to_i rescue 0 end
+ if value > 0
+ value
+ else
+ default
+ end
+ end
- min_nodes = 1
- begin
- if job.runtime_constraints['min_nodes']
- min_nodes = begin job.runtime_constraints['min_nodes'].to_i rescue 1 end
+ NODE_CONSTRAINT_MAP = {
+ # Map Job runtime_constraints keys to the corresponding Node info key.
+ 'min_ram_mb_per_node' => 'total_ram_mb',
+ 'min_scratch_mb_per_node' => 'total_scratch_mb',
+ 'min_cores_per_node' => 'total_cpu_cores',
+ }
+
+ def nodes_available_for_job_now(job)
+ # Find Nodes that satisfy a Job's runtime constraints (by building
+ # a list of Procs and using them to test each Node). If there
+ # enough to run the Job, return an array of their names.
+ # Otherwise, return nil.
+ need_procs = NODE_CONSTRAINT_MAP.each_pair.map do |job_key, node_key|
+ Proc.new do |node|
+ positive_int(node.info[node_key], 0) >=
+ positive_int(job.runtime_constraints[job_key], 0)
+ end
+ end
+ min_node_count = positive_int(job.runtime_constraints['min_nodes'], 1)
+ usable_nodes = []
+ Node.find_each do |node|
+ good_node = (node.info['slurm_state'] == 'idle')
+ need_procs.each { |node_test| good_node &&= node_test.call(node) }
+ if good_node
+ usable_nodes << node
+ if usable_nodes.count >= min_node_count
+ return usable_nodes.map { |node| node.hostname }
end
end
+ end
+ nil
+ end
- begin
- next if @nodes_in_state[:idle] < min_nodes
- rescue
- end
+ def nodes_available_for_job(job)
+ # Check if there are enough idle nodes with the Job's minimum
+ # hardware requirements to run it. If so, return an array of
+ # their names. If not, up to once per hour, signal start_jobs to
+ # hold off launching Jobs. This delay is meant to give the Node
+ # Manager an opportunity to make new resources available for new
+ # Jobs.
+ #
+ # The exact timing parameters here might need to be adjusted for
+ # the best balance between helping the longest-waiting Jobs run,
+ # and making efficient use of immediately available resources.
+ # These are all just first efforts until we have more data to work
+ # with.
+ nodelist = nodes_available_for_job_now(job)
+ if nodelist.nil? and not did_recently(:wait_for_available_nodes, 3600)
+ $stderr.puts "dispatch: waiting for nodes for #{job.uuid}"
+ @node_wait_deadline = Time.now + 5.minutes
+ end
+ nodelist
+ end
+ def start_jobs
+ @todo.each do |job|
next if @running[job.uuid]
- next if !take(job)
cmd_args = nil
case Server::Application.config.crunch_job_wrapper
when :none
cmd_args = []
when :slurm_immediate
+ nodelist = nodes_available_for_job(job)
+ if nodelist.nil?
+ if Time.now < @node_wait_deadline
+ break
+ else
+ next
+ end
+ end
cmd_args = ["salloc",
"--chdir=/",
"--immediate",
"--exclusive",
"--no-kill",
"--job-name=#{job.uuid}",
- "--nodes=#{min_nodes}"]
+ "--nodelist=#{nodelist.join(',')}"]
else
raise "Unknown crunch_job_wrapper: #{Server::Application.config.crunch_job_wrapper}"
end
+ next if !take(job)
+
if Server::Application.config.crunch_job_user
cmd_args.unshift("sudo", "-E", "-u",
Server::Application.config.crunch_job_user,
@@ -212,6 +267,7 @@ class Dispatcher
stderr_flushed_at: 0
}
i.close
+ update_node_status
end
end
commit 541ce54ba2c7c8f9783da04f947ccf055b72ae2c
Author: Brett Smith <brett at curoverse.com>
Date: Fri Jun 6 12:03:54 2014 -0400
2880: Improve error reporting when crunch-dispatch updates node status.
diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index c40c62d..11d5540 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -88,13 +88,16 @@ class Dispatcher
if node
$stderr.puts "dispatch: update #{re[1]} state to #{re[2]}"
node.info['slurm_state'] = re[2]
- node.save
+ if not node.save
+ $stderr.puts "dispatch: failed to update #{node.uuid}: #{node.errors.messages}"
+ end
elsif re[2] != 'down'
$stderr.puts "dispatch: sinfo reports '#{re[1]}' is not down, but no node has that name"
end
end
end
- rescue
+ rescue => error
+ $stderr.puts "dispatch: error updating node status: #{error}"
end
end
end
commit 35db7f5c7f1d62f996550f51fc4f0dd4f77627fb
Author: Brett Smith <brett at curoverse.com>
Date: Fri Jun 6 12:03:08 2014 -0400
2880: crunch-dispatch must stop putting symbols in the database.
diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index 3ddf83d..c40c62d 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -87,7 +87,7 @@ class Dispatcher
node = Node.where('hostname=?', re[1]).first
if node
$stderr.puts "dispatch: update #{re[1]} state to #{re[2]}"
- node.info[:slurm_state] = re[2]
+ node.info['slurm_state'] = re[2]
node.save
elsif re[2] != 'down'
$stderr.puts "dispatch: sinfo reports '#{re[1]}' is not down, but no node has that name"
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list