[ARVADOS] updated: 55db20860c90c60dd231083c1cc81001c3fbe971

git at public.curoverse.com git at public.curoverse.com
Mon Aug 11 18:40:34 EDT 2014


Summary of changes:
 apps/workbench/app/helpers/application_helper.rb   |   2 +-
 .../app/views/application/_choose.html.erb         |  13 +-
 .../test/integration/pipeline_instances_test.rb    |   1 +
 doc/_config.yml                                    |   2 +-
 ...py.liquid => _concurrent_hash_script_py.liquid} |   0
 .../topics/tutorial-job-debug.html.textile.liquid  | 163 --------------------
 .../topics/tutorial-parallel.html.textile.liquid   |  22 +--
 .../tutorial-firstscript.html.textile.liquid       | 165 ++++++++-------------
 ...uid => tutorial-submit-job.html.textile.liquid} |  11 +-
 sdk/cli/bin/crunch-job                             |  24 +--
 services/api/script/crunch-dispatch.rb             |  15 +-
 11 files changed, 117 insertions(+), 301 deletions(-)
 rename doc/_includes/{_parallel_hash_script_py.liquid => _concurrent_hash_script_py.liquid} (100%)
 delete mode 100644 doc/user/topics/tutorial-job-debug.html.textile.liquid
 copy doc/user/tutorials/{tutorial-firstscript.html.textile.liquid => tutorial-submit-job.html.textile.liquid} (90%)

       via  55db20860c90c60dd231083c1cc81001c3fbe971 (commit)
       via  a6010696b246cc43c721a35d56179c629d54e798 (commit)
       via  6c91320c3a53ddbe1b81bee8ed6322c20acbd047 (commit)
       via  aa923c951ff68e7131628c449b5e2f8a74e2a21e (commit)
       via  965bf2a51134ed26fb23d2209d9a0d319fe05744 (commit)
       via  cd3a0b27c6985f72d58581d43c7cd6686a977f56 (commit)
       via  7f4ff745bfd5356e885e13b4984d93c5c5b204f1 (commit)
       via  774a5d8543e7335a8580c7c86f2436b96aff1024 (commit)
       via  872cf2e096a7b72722aa76040339a23e962cdf96 (commit)
       via  802fd8ec8df8a36029c76b20e472f78c09772950 (commit)
       via  b79c440b6af5cf4855e3fb3fb510b3eb8de1da9c (commit)
       via  fb0585a11cc10929cd8121ddd219855172754ef3 (commit)
      from  7ea5d6161e4eab67795fbe2758d98822a9a03f13 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 55db20860c90c60dd231083c1cc81001c3fbe971
Author: Tom Clegg <tom at curoverse.com>
Date:   Mon Aug 11 18:33:52 2014 -0400

    3570: Use exit code 75 to mean "temporary locking failure". Add comments.

diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index d5edf0be..e7cac18 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -84,6 +84,8 @@ use File::Temp;
 use Fcntl ':flock';
 use File::Path qw( make_path );
 
+use constant EX_TEMPFAIL => 75;
+
 $ENV{"TMPDIR"} ||= "/tmp";
 unless (defined $ENV{"CRUNCH_TMP"}) {
   $ENV{"CRUNCH_TMP"} = $ENV{"TMPDIR"} . "/crunch-job";
@@ -151,24 +153,24 @@ if ($job_has_uuid)
   $Job = $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
   if (!$force_unlock) {
     # If some other crunch-job process has grabbed this job (or we see
-    # other evidence that the job is already underway) we exit 111 so
-    # crunch-dispatch (our parent process) doesn't mark the job as
-    # failed.
+    # other evidence that the job is already underway) we exit
+    # EX_TEMPFAIL so crunch-dispatch (our parent process) doesn't
+    # mark the job as failed.
     if ($Job->{'is_locked_by_uuid'}) {
-      Log(undef, "Job is locked by " . $Job->{'is_locked_by_uuid'} . ", exiting 111");
-      exit(111);
+      Log(undef, "Job is locked by " . $Job->{'is_locked_by_uuid'});
+      exit EX_TEMPFAIL;
     }
     if ($Job->{'success'} ne undef) {
       Log(undef, "Job 'success' flag (" . $Job->{'success'} . ") is not null");
-      exit(111);
+      exit EX_TEMPFAIL;
     }
     if ($Job->{'running'}) {
       Log(undef, "Job 'running' flag is already set");
-      exit(111);
+      exit EX_TEMPFAIL;
     }
     if ($Job->{'started_at'}) {
       Log(undef, "Job 'started_at' time is already set (" . $Job->{'started_at'} . ")");
-      exit(111);
+      exit EX_TEMPFAIL;
     }
   }
 }
@@ -281,8 +283,8 @@ if ($job_has_uuid)
   # Claim this job, and make sure nobody else does
   unless ($Job->update_attributes('is_locked_by_uuid' => $User->{'uuid'}) &&
           $Job->{'is_locked_by_uuid'} == $User->{'uuid'}) {
-    Log(undef, "Error while updating / locking job, exiting 111");
-    exit(111);
+    Log(undef, "Error while updating / locking job, exiting ".EX_TEMPFAIL);
+    exit EX_TEMPFAIL;
   }
   $Job->update_attributes('started_at' => scalar gmtime,
                           'running' => 1,
@@ -893,7 +895,7 @@ else {
 Log (undef, "finish");
 
 save_meta();
-exit 0;
+exit ($Job->{'success'} ? 1 : 0);
 
 
 
diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index 58e6645..154fcf3 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -379,7 +379,7 @@ class Dispatcher
     exit_status = j_done[:wait_thr].value
 
     jobrecord = Job.find_by_uuid(job_done.uuid)
-    if exit_status.to_i != 111 and jobrecord.started_at
+    if exit_status.to_i != 75 and jobrecord.started_at
       # Clean up state fields in case crunch-job exited without
       # putting the job in a suitable "finished" state.
       jobrecord.running = false
@@ -392,7 +392,18 @@ class Dispatcher
       # Don't fail the job if crunch-job didn't even get as far as
       # starting it. If the job failed to run due to an infrastructure
       # issue with crunch-job or slurm, we want the job to stay in the
-      # queue.
+      # queue. If crunch-job exited after losing a race to another
+      # crunch-job process, it exits 75 and we should leave the job
+      # record alone so the winner of the race can do its thing.
+      #
+      # There is still an unhandled race condition: If our crunch-job
+      # process is about to lose a race with another crunch-job
+      # process, but crashes before getting to its "exit 75" (for
+      # example, "cannot fork" or "cannot reach API server") then we
+      # will assume incorrectly that it's our process's fault
+      # jobrecord.started_at is non-nil, and mark the job as failed
+      # even though the winner of the race is probably still doing
+      # fine.
     end
 
     # Invalidate the per-job auth token

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list