[ARVADOS] updated: 82c4697bf24b10f3fb66d303ae73499095b5742a

git at public.curoverse.com git at public.curoverse.com
Tue Jun 10 14:13:10 EDT 2014


Summary of changes:
 services/api/script/crunch-dispatch.rb | 97 +++++++++++++++++++++++++++-------
 1 file changed, 78 insertions(+), 19 deletions(-)

       via  82c4697bf24b10f3fb66d303ae73499095b5742a (commit)
       via  505f5c37bb9fe1fe93f8bdbd2d2072e783832f20 (commit)
       via  541ce54ba2c7c8f9783da04f947ccf055b72ae2c (commit)
       via  35db7f5c7f1d62f996550f51fc4f0dd4f77627fb (commit)
      from  7e865395a4ccf9f17b904c3700064328d52db121 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 82c4697bf24b10f3fb66d303ae73499095b5742a
Merge: 7e86539 505f5c3
Author: Brett Smith <brett at curoverse.com>
Date:   Tue Jun 10 14:12:31 2014 -0400

    Merge branch '2880-crunch-dispatch-node-constraints'
    
    Closes #2880, #2976, #2993.


commit 505f5c37bb9fe1fe93f8bdbd2d2072e783832f20
Author: Brett Smith <brett at curoverse.com>
Date:   Fri Jun 6 13:18:38 2014 -0400

    2880: Don't dispatch Jobs until runtime constraints are met.
    
    This retains the same FIFO approach to the Job queue that
    crunch-dispatch currently uses, but now when it encounters a Job whose
    constraints are not met:
    
    * it may wait for a while to see if the Node Manager makes Nodes
      available, if it hasn't already waited within the past hour; and
    
    * it leaves that Job in the queue and tries to process the next one.
    
    See #2880 for further background.  The exact parameters of "waiting
    for Nodes" will probably need tuning, but that will be easier to do
    after it's been in production for a while.

diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index 11d5540..59e3aff 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -65,7 +65,6 @@ class Dispatcher
 
   def update_node_status
     if Server::Application.config.crunch_job_wrapper.to_s.match /^slurm/
-      @nodes_in_state = {idle: 0, alloc: 0, down: 0}
       @node_state ||= {}
       node_seen = {}
       begin
@@ -78,9 +77,6 @@ class Dispatcher
           next if node_seen[re[1]]
           node_seen[re[1]] = true
 
-          # count nodes in each state
-          @nodes_in_state[re[2].to_sym] += 1
-
           # update our database (and cache) when a node's state changes
           if @node_state[re[1]] != re[2]
             @node_state[re[1]] = re[2]
@@ -102,40 +98,99 @@ class Dispatcher
     end
   end
 
-  def start_jobs
-    @todo.each do |job|
+  def positive_int(raw_value, default=nil)
+    value = begin raw_value.to_i rescue 0 end
+    if value > 0
+      value
+    else
+      default
+    end
+  end
 
-      min_nodes = 1
-      begin
-        if job.runtime_constraints['min_nodes']
-          min_nodes = begin job.runtime_constraints['min_nodes'].to_i rescue 1 end
+  NODE_CONSTRAINT_MAP = {
+    # Map Job runtime_constraints keys to the corresponding Node info key.
+    'min_ram_mb_per_node' => 'total_ram_mb',
+    'min_scratch_mb_per_node' => 'total_scratch_mb',
+    'min_cores_per_node' => 'total_cpu_cores',
+  }
+
+  def nodes_available_for_job_now(job)
+    # Find Nodes that satisfy a Job's runtime constraints (by building
+    # a list of Procs and using them to test each Node).  If there
+    # are enough to run the Job, return an array of their names.
+    # Otherwise, return nil.
+    need_procs = NODE_CONSTRAINT_MAP.each_pair.map do |job_key, node_key|
+      Proc.new do |node|
+        positive_int(node.info[node_key], 0) >=
+          positive_int(job.runtime_constraints[job_key], 0)
+      end
+    end
+    min_node_count = positive_int(job.runtime_constraints['min_nodes'], 1)
+    usable_nodes = []
+    Node.find_each do |node|
+      good_node = (node.info['slurm_state'] == 'idle')
+      need_procs.each { |node_test| good_node &&= node_test.call(node) }
+      if good_node
+        usable_nodes << node
+        if usable_nodes.count >= min_node_count
+          return usable_nodes.map { |node| node.hostname }
         end
       end
+    end
+    nil
+  end
 
-      begin
-        next if @nodes_in_state[:idle] < min_nodes
-      rescue
-      end
+  def nodes_available_for_job(job)
+    # Check if there are enough idle nodes with the Job's minimum
+    # hardware requirements to run it.  If so, return an array of
+    # their names.  If not, up to once per hour, signal start_jobs to
+    # hold off launching Jobs.  This delay is meant to give the Node
+    # Manager an opportunity to make new resources available for new
+    # Jobs.
+    #
+    # The exact timing parameters here might need to be adjusted for
+    # the best balance between helping the longest-waiting Jobs run,
+    # and making efficient use of immediately available resources.
+    # These are all just first efforts until we have more data to work
+    # with.
+    nodelist = nodes_available_for_job_now(job)
+    if nodelist.nil? and not did_recently(:wait_for_available_nodes, 3600)
+      $stderr.puts "dispatch: waiting for nodes for #{job.uuid}"
+      @node_wait_deadline = Time.now + 5.minutes
+    end
+    nodelist
+  end
 
+  def start_jobs
+    @todo.each do |job|
       next if @running[job.uuid]
-      next if !take(job)
 
       cmd_args = nil
       case Server::Application.config.crunch_job_wrapper
       when :none
         cmd_args = []
       when :slurm_immediate
+        nodelist = nodes_available_for_job(job)
+        if nodelist.nil?
+          if Time.now < @node_wait_deadline
+            break
+          else
+            next
+          end
+        end
         cmd_args = ["salloc",
                     "--chdir=/",
                     "--immediate",
                     "--exclusive",
                     "--no-kill",
                     "--job-name=#{job.uuid}",
-                    "--nodes=#{min_nodes}"]
+                    "--nodelist=#{nodelist.join(',')}"]
       else
         raise "Unknown crunch_job_wrapper: #{Server::Application.config.crunch_job_wrapper}"
       end
 
+      next if !take(job)
+
       if Server::Application.config.crunch_job_user
         cmd_args.unshift("sudo", "-E", "-u",
                          Server::Application.config.crunch_job_user,
@@ -212,6 +267,7 @@ class Dispatcher
         stderr_flushed_at: 0
       }
       i.close
+      update_node_status
     end
   end
 

commit 541ce54ba2c7c8f9783da04f947ccf055b72ae2c
Author: Brett Smith <brett at curoverse.com>
Date:   Fri Jun 6 12:03:54 2014 -0400

    2880: Improve error reporting when crunch-dispatch updates node status.

diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index c40c62d..11d5540 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -88,13 +88,16 @@ class Dispatcher
             if node
               $stderr.puts "dispatch: update #{re[1]} state to #{re[2]}"
               node.info['slurm_state'] = re[2]
-              node.save
+              if not node.save
+                $stderr.puts "dispatch: failed to update #{node.uuid}: #{node.errors.messages}"
+              end
             elsif re[2] != 'down'
               $stderr.puts "dispatch: sinfo reports '#{re[1]}' is not down, but no node has that name"
             end
           end
         end
-      rescue
+      rescue => error
+        $stderr.puts "dispatch: error updating node status: #{error}"
       end
     end
   end

commit 35db7f5c7f1d62f996550f51fc4f0dd4f77627fb
Author: Brett Smith <brett at curoverse.com>
Date:   Fri Jun 6 12:03:08 2014 -0400

    2880: crunch-dispatch must stop putting symbols in the database.

diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index 3ddf83d..c40c62d 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -87,7 +87,7 @@ class Dispatcher
             node = Node.where('hostname=?', re[1]).first
             if node
               $stderr.puts "dispatch: update #{re[1]} state to #{re[2]}"
-              node.info[:slurm_state] = re[2]
+              node.info['slurm_state'] = re[2]
               node.save
             elsif re[2] != 'down'
               $stderr.puts "dispatch: sinfo reports '#{re[1]}' is not down, but no node has that name"

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list