[ARVADOS] created: fb5d279228a7adcafd858d9e137accd8010bb382

Git user git at public.curoverse.com
Thu May 11 11:38:47 EDT 2017


        at  fb5d279228a7adcafd858d9e137accd8010bb382 (commit)


commit fb5d279228a7adcafd858d9e137accd8010bb382
Author: Tom Clegg <tom at curoverse.com>
Date:   Thu May 11 11:38:37 2017 -0400

    11590: Add "reuse jobs even if conflicting outputs exist" and "log reuse decisions" config options.

diff --git a/services/api/app/models/job.rb b/services/api/app/models/job.rb
index 90fba3c..51f2df4 100644
--- a/services/api/app/models/job.rb
+++ b/services/api/app/models/job.rb
@@ -198,6 +198,16 @@ class Job < ArvadosModel
     filters
   end
 
+  # reuselog logs whatever the given block returns, if reuse logging
+  # is enabled. It accepts a block instead of a string because in some
+  # cases constructing the strings involves doing database queries,
+  # and we want to skip those queries when logging is disabled.
+  def self.reuselog
+    if Rails.configuration.log_reuse_decisions
+      Rails.logger.info("find_reusable: " + yield)
+    end
+  end
+
   def self.find_reusable attrs, params, filters, read_users
     if filters.empty?  # Translate older creation parameters into filters.
       filters =
@@ -237,32 +247,57 @@ class Job < ArvadosModel
     end
 
     # Search for a reusable Job, and return it if found.
-    candidates = Job.
-      readable_by(current_user).
-      where('state = ? or (owner_uuid = ? and state in (?))',
-            Job::Complete, current_user.uuid, [Job::Queued, Job::Running]).
-      where('script_parameters_digest = ?', Job.sorted_hash_digest(attrs[:script_parameters])).
-      where('nondeterministic is distinct from ?', true).
-      order('state desc, created_at') # prefer Running jobs over Queued
+    candidates = Job.readable_by(current_user).where(
+      'state = ? or (owner_uuid = ? and state in (?))',
+      Job::Complete, current_user.uuid, [Job::Queued, Job::Running])
+    reuselog { "have #{candidates.count} readable jobs in a reusable state" }
+
+    digest = Job.sorted_hash_digest(attrs[:script_parameters])
+    candidates = candidates.where('script_parameters_digest = ?', digest)
+    reuselog { "have #{candidates.count} candidates after filtering on script_parameters_digest #{digest}" }
+
+    candidates = candidates.where('nondeterministic is distinct from ?', true)
+    reuselog { "have #{candidates.count} candidates after filtering on !nondeterministic" }
+
+    # prefer Running jobs over Queued
+    candidates = candidates.order('state desc, created_at')
+
     candidates = apply_filters candidates, filters
+    reuselog { "have #{candidates.count} candidates after filtering on repo, script, and custom filters #{filters.inspect}" }
+
     chosen = nil
     incomplete_job = nil
     candidates.each do |j|
       if j.state != Job::Complete
-        # We'll use this if we don't find a job that has completed
-        incomplete_job ||= j
-        next
-      end
-
-      if chosen == false
-        # We have already decided not to reuse any completed job
-        next
+        if !incomplete_job
+          # We'll use this if we don't find a job that has completed
+          reuselog { "job #{j.uuid} is reusable, but unfinished; continuing search for completed jobs" }
+          incomplete_job = j
+        else
+          reuselog { "job #{j.uuid} is reusable, but unfinished, so not better than #{incomplete_job.uuid}; ignoring" }
+        end
+      elsif chosen == false
+        # Ignore: we have already decided not to reuse any completed
+        # job.
+        reuselog { "job #{j.uuid} output #{j.output} ignored, see above" }
+      elsif Rails.configuration.reuse_job_if_outputs_differ
+        if Collection.readable_by(current_user).find_by_portable_data_hash(j.output)
+          reuselog { "job #{j.uuid} with output #{j.output} is reusable; decision is final." }
+          return j
+        else
+          # Ignore: keep locking for an incomplete job or one whose
+          # output is readable.
+          reuselog { "job #{j.uuid} output #{j.output} unavailable to user; continuing search" }
+        end
       elsif chosen
         if chosen.output != j.output
           # If two matching jobs produced different outputs, run a new
           # job (or use one that's already running/queued) instead of
           # choosing one arbitrarily.
+          reuselog { "job #{j.uuid} output #{j.output} disagrees; forgetting about #{chosen.uuid} and ignoring any other finished jobs (see reuse_job_if_outputs_differ in application.default.yml)" }
           chosen = false
+        else
+          reuselog { "job #{j.uuid} output #{j.output} agrees with chosen #{chosen.uuid}; continuing search in case others disagree" }
         end
         # ...and that's the only thing we need to do once we've chosen
         # a job to reuse.
@@ -270,12 +305,20 @@ class Job < ArvadosModel
         # As soon as the output we will end up returning (if any) is
         # decided, check whether it will be visible to the user; if
         # not, any further investigation of reusable jobs is futile.
+        reuselog { "job #{j.uuid} output #{j.output} unavailable to user; now refusing to reuse any finished job" }
         chosen = false
       else
+        reuselog { "job #{j.uuid} output #{j.output} can be reused; continuing search in case others disagree" }
         chosen = j
       end
     end
-    chosen || incomplete_job
+    j = chosen || incomplete_job
+    if j
+      reuselog { "done, #{j.uuid} was selected" }
+    else
+      reuselog { "done, nothing suitable" }
+    end
+    return j
   end
 
   def self.default_git_filters(attr_name, repo_name, refspec)
diff --git a/services/api/config/application.default.yml b/services/api/config/application.default.yml
index 32c7b24..c2f8958 100644
--- a/services/api/config/application.default.yml
+++ b/services/api/config/application.default.yml
@@ -358,6 +358,27 @@ common:
 
 
   ###
+  ### Job and container reuse logic.
+  ###
+
+  # Include details about job reuse decisions in the server log. This
+  # causes additional database queries to run, so it should not be
+  # enabled unless you expect to examine the resulting logs for
+  # troubleshooting purposes.
+  log_reuse_decisions: false
+
+  # Control job reuse behavior when two completed jobs match the
+  # search criteria and have different outputs.
+  #
+  # If true, in case of a conflict, reuse the earliest job (this is
+  # similar to container reuse behavior).
+  #
+  # If false, in case of a conflict, do not reuse any completed job,
+  # but do reuse an already-running job if available (this is the
+  # original job reuse behavior, and is still the default).
+  reuse_job_if_outputs_differ: false
+
+  ###
   ### Remaining assorted configuration options.
   ###
 
diff --git a/services/api/test/unit/job_test.rb b/services/api/test/unit/job_test.rb
index 9241465..e0a4f84 100644
--- a/services/api/test/unit/job_test.rb
+++ b/services/api/test/unit/job_test.rb
@@ -557,7 +557,18 @@ class JobTest < ActiveSupport::TestCase
     assert_equal Job.deep_sort_hash(a).to_json, Job.deep_sort_hash(b).to_json
   end
 
-  test 'find_reusable' do
+  test 'find_reusable without logging' do
+    Rails.logger.expects(:info).never
+    try_find_reusable
+  end
+
+  test 'find_reusable with logging' do
+    Rails.configuration.log_reuse_decisions = true
+    Rails.logger.expects(:info).at_least(3)
+    try_find_reusable
+  end
+
+  def try_find_reusable
     foobar = jobs(:foobar)
     example_attrs = {
       script_version: foobar.script_version,
@@ -577,6 +588,11 @@ class JobTest < ActiveSupport::TestCase
     Job.where(uuid: jobs(:job_with_latest_version).uuid).
       update_all(output: 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa+1')
     assert_nil Job.find_reusable(example_attrs, {}, [], [users(:active)])
+
+    # ...unless config says to reuse the earlier job in such cases.
+    Rails.configuration.reuse_job_if_outputs_differ = true
+    j = Job.find_reusable(example_attrs, {}, [], [users(:active)])
+    assert_equal foobar.uuid, j.uuid
   end
 
   [

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list