[ARVADOS] created: 6e873d993ddcb2a202013e6d49eda792dac70c21

git at public.curoverse.com git at public.curoverse.com
Fri Jun 6 13:17:52 EDT 2014


        at  6e873d993ddcb2a202013e6d49eda792dac70c21 (commit)


commit 6e873d993ddcb2a202013e6d49eda792dac70c21
Author: Brett Smith <brett at curoverse.com>
Date:   Fri Jun 6 13:18:38 2014 -0400

    2880: Don't dispatch Jobs until runtime constraints are met.
    
    This retains the same FIFO approach to the Job queue that
    crunch-dispatch currently uses, but now when it encounters a Job whose
    constraints are not met:
    
    * it may wait for a while to see if the Node Manager makes Nodes
      available, if it hasn't done that this hour; and
    
    * it leaves that Job in the queue and tries to process the next one.
    
    See #2880 for further background.  The exact parameters of "waiting
    for Nodes" will probably need tuning, but that will be easier to do
    after it's been in production for a while.

diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index 11d5540..e3b3a54 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -65,7 +65,6 @@ class Dispatcher
 
   def update_node_status
     if Server::Application.config.crunch_job_wrapper.to_s.match /^slurm/
-      @nodes_in_state = {idle: 0, alloc: 0, down: 0}
       @node_state ||= {}
       node_seen = {}
       begin
@@ -78,9 +77,6 @@ class Dispatcher
           next if node_seen[re[1]]
           node_seen[re[1]] = true
 
-          # count nodes in each state
-          @nodes_in_state[re[2].to_sym] += 1
-
           # update our database (and cache) when a node's state changes
           if @node_state[re[1]] != re[2]
             @node_state[re[1]] = re[2]
@@ -102,23 +98,80 @@ class Dispatcher
     end
   end
 
-  def start_jobs
-    @todo.each do |job|
+  def positive_int(raw_value, default=nil)
+    value = begin raw_value.to_i rescue 0 end
+    if value > 0
+      value
+    else
+      default
+    end
+  end
 
-      min_nodes = 1
-      begin
-        if job.runtime_constraints['min_nodes']
-          min_nodes = begin job.runtime_constraints['min_nodes'].to_i rescue 1 end
+  NODE_CONSTRAINT_MAP = {
+    # Map Job runtime_constraints keys to the corresponding Node info key.
+    'min_ram_mb_per_node' => 'total_ram_mb',
+    'min_scratch_mb_per_node' => 'total_scratch_mb',
+    'min_cores_per_node' => 'total_cpu_cores',
+  }
+
+  def nodes_available_for_job_now(job)
+    # Find Nodes that satisfy a Job's runtime constraints (by building
+    # a list of Procs and using them to test each Node).  If there
+    # are enough to run the Job, return an array of their names.
+    # Otherwise, return nil.
+    need_procs = NODE_CONSTRAINT_MAP.each_pair.map do |job_key, node_key|
+      Proc.new do |node|
+        positive_int(node.info[node_key], 0) >=
+          positive_int(job.runtime_constraints[job_key], 0)
+      end
+    end
+    min_node_count = positive_int(job.runtime_constraints['min_nodes'], 1)
+    usable_nodes = []
+    Node.find_each do |node|
+      good_node = (node.info['slurm_state'] == 'idle')
+      need_procs.each { |node_test| good_node &&= node_test.call(node) }
+      if good_node
+        usable_nodes << node
+        if usable_nodes.count >= min_node_count
+          return usable_nodes.map { |node| node.hostname }
         end
       end
+    end
+    nil
+  end
 
-      begin
-        next if @nodes_in_state[:idle] < min_nodes
-      rescue
+  def nodes_available_for_job(job)
+    # Check if there are enough idle nodes with the Job's minimum
+    # hardware requirements to run it.  If so, return an array of
+    # their names.  If not, we'll wait a little bit to see if the Node
+    # Manager makes some available--up to five minutes every
+    # hour--before returning nil.
+    #
+    # The exact timing parameters here might need to be adjusted for
+    # the best balance between helping the longest-waiting Jobs run,
+    # and making efficient use of immediately available resources.
+    # These are all just first efforts until we have more data to work
+    # with.
+    if nodelist = nodes_available_for_job_now(job)
+      nodelist
+    elsif did_recently(:wait_for_available_nodes, 3600)
+      nil
+    else
+      $stderr.puts "dispatch: waiting for nodes for #{job.uuid}"
+      deadline = Time.now + 300
+      while (Time.now < deadline) and not $signal[:term]
+        sleep(60)
+        break if nodelist = nodes_available_for_job_now(job)
       end
+      nodelist
+    end
+  end
 
+  def start_jobs
+    @todo.each do |job|
       next if @running[job.uuid]
-      next if !take(job)
+      nodelist = nodes_available_for_job(job)
+      next if nodelist.nil? or !take(job)
 
       cmd_args = nil
       case Server::Application.config.crunch_job_wrapper
@@ -131,7 +184,7 @@ class Dispatcher
                     "--exclusive",
                     "--no-kill",
                     "--job-name=#{job.uuid}",
-                    "--nodes=#{min_nodes}"]
+                    "--nodelist=#{nodelist.join(',')}"]
       else
         raise "Unknown crunch_job_wrapper: #{Server::Application.config.crunch_job_wrapper}"
       end
@@ -212,6 +265,7 @@ class Dispatcher
         stderr_flushed_at: 0
       }
       i.close
+      update_node_status
     end
   end
 

commit 3a7e410317a6dcf053bd7e2ea220c1a247ececca
Author: Brett Smith <brett at curoverse.com>
Date:   Fri Jun 6 12:03:54 2014 -0400

    2880: Improve error reporting when crunch-dispatch updates node status.

diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index c40c62d..11d5540 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -88,13 +88,16 @@ class Dispatcher
             if node
               $stderr.puts "dispatch: update #{re[1]} state to #{re[2]}"
               node.info['slurm_state'] = re[2]
-              node.save
+              if not node.save
+                $stderr.puts "dispatch: failed to update #{node.uuid}: #{node.errors.messages}"
+              end
             elsif re[2] != 'down'
               $stderr.puts "dispatch: sinfo reports '#{re[1]}' is not down, but no node has that name"
             end
           end
         end
-      rescue
+      rescue => error
+        $stderr.puts "dispatch: error updating node status: #{error}"
       end
     end
   end

commit a389678793930c27bce2d16f5149aacad4597959
Author: Brett Smith <brett at curoverse.com>
Date:   Fri Jun 6 12:03:08 2014 -0400

    2880: crunch-dispatch must stop putting symbols in the database.

diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index 3ddf83d..c40c62d 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -87,7 +87,7 @@ class Dispatcher
             node = Node.where('hostname=?', re[1]).first
             if node
               $stderr.puts "dispatch: update #{re[1]} state to #{re[2]}"
-              node.info[:slurm_state] = re[2]
+              node.info['slurm_state'] = re[2]
               node.save
             elsif re[2] != 'down'
               $stderr.puts "dispatch: sinfo reports '#{re[1]}' is not down, but no node has that name"

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list