[ARVADOS] created: 7ea5d6161e4eab67795fbe2758d98822a9a03f13

git at public.curoverse.com git at public.curoverse.com
Mon Aug 11 13:26:00 EDT 2014


        at  7ea5d6161e4eab67795fbe2758d98822a9a03f13 (commit)


commit 7ea5d6161e4eab67795fbe2758d98822a9a03f13
Author: Tom Clegg <tom at curoverse.com>
Date:   Mon Aug 11 13:25:48 2014 -0400

    3570: Do not fail the job when crunch-job loses a locking race during startup.

diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index 06b3da9..d5edf0be 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -150,17 +150,25 @@ if ($job_has_uuid)
 {
   $Job = $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
   if (!$force_unlock) {
+    # If some other crunch-job process has grabbed this job (or we see
+    # other evidence that the job is already underway) we exit 111 so
+    # crunch-dispatch (our parent process) doesn't mark the job as
+    # failed.
     if ($Job->{'is_locked_by_uuid'}) {
-      croak("Job is locked: " . $Job->{'is_locked_by_uuid'});
+      Log(undef, "Job is locked by " . $Job->{'is_locked_by_uuid'} . ", exiting 111");
+      exit(111);
     }
     if ($Job->{'success'} ne undef) {
-      croak("Job 'success' flag (" . $Job->{'success'} . ") is not null");
+      Log(undef, "Job 'success' flag (" . $Job->{'success'} . ") is not null");
+      exit(111);
     }
     if ($Job->{'running'}) {
-      croak("Job 'running' flag is already set");
+      Log(undef, "Job 'running' flag is already set");
+      exit(111);
     }
     if ($Job->{'started_at'}) {
-      croak("Job 'started_at' time is already set (" . $Job->{'started_at'} . ")");
+      Log(undef, "Job 'started_at' time is already set (" . $Job->{'started_at'} . ")");
+      exit(111);
     }
   }
 }
@@ -273,7 +281,8 @@ if ($job_has_uuid)
   # Claim this job, and make sure nobody else does
   unless ($Job->update_attributes('is_locked_by_uuid' => $User->{'uuid'}) &&
           $Job->{'is_locked_by_uuid'} == $User->{'uuid'}) {
-    croak("Error while updating / locking job");
+    Log(undef, "Error while updating / locking job, exiting 111");
+    exit(111);
   }
   $Job->update_attributes('started_at' => scalar gmtime,
                           'running' => 1,
@@ -688,7 +697,9 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
 
     my @execargs = ('bash', '-c', $command);
     srun (\@srunargs, \@execargs, undef, $build_script_to_send);
-    exit (111);
+    # exec() failed, we assume nothing happened.
+    Log(undef, "srun() failed on build script");
+    die;
   }
   close("writer");
   if (!defined $childpid)
diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index 5a990f0..58e6645 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -375,11 +375,11 @@ class Dispatcher
       $stderr.puts j_done[:stderr_buf] + "\n"
     end
 
-    # Wait the thread
-    j_done[:wait_thr].value
+    # Wait the thread (returns a Process::Status)
+    exit_status = j_done[:wait_thr].value
 
     jobrecord = Job.find_by_uuid(job_done.uuid)
-    if jobrecord.started_at
+    if exit_status.to_i != 111 and jobrecord.started_at
       # Clean up state fields in case crunch-job exited without
       # putting the job in a suitable "finished" state.
       jobrecord.running = false

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list