[ARVADOS] created: 7ea5d6161e4eab67795fbe2758d98822a9a03f13
git at public.curoverse.com
git at public.curoverse.com
Mon Aug 11 13:26:00 EDT 2014
at 7ea5d6161e4eab67795fbe2758d98822a9a03f13 (commit)
commit 7ea5d6161e4eab67795fbe2758d98822a9a03f13
Author: Tom Clegg <tom at curoverse.com>
Date: Mon Aug 11 13:25:48 2014 -0400
3570: Do not fail the job when crunch-job loses a locking race during startup.
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index 06b3da9..d5edf0be 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -150,17 +150,25 @@ if ($job_has_uuid)
{
$Job = $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
if (!$force_unlock) {
+ # If some other crunch-job process has grabbed this job (or we see
+ # other evidence that the job is already underway) we exit 111 so
+ # crunch-dispatch (our parent process) doesn't mark the job as
+ # failed.
if ($Job->{'is_locked_by_uuid'}) {
- croak("Job is locked: " . $Job->{'is_locked_by_uuid'});
+ Log(undef, "Job is locked by " . $Job->{'is_locked_by_uuid'} . ", exiting 111");
+ exit(111);
}
if ($Job->{'success'} ne undef) {
- croak("Job 'success' flag (" . $Job->{'success'} . ") is not null");
+ Log(undef, "Job 'success' flag (" . $Job->{'success'} . ") is not null");
+ exit(111);
}
if ($Job->{'running'}) {
- croak("Job 'running' flag is already set");
+ Log(undef, "Job 'running' flag is already set");
+ exit(111);
}
if ($Job->{'started_at'}) {
- croak("Job 'started_at' time is already set (" . $Job->{'started_at'} . ")");
+ Log(undef, "Job 'started_at' time is already set (" . $Job->{'started_at'} . ")");
+ exit(111);
}
}
}
@@ -273,7 +281,8 @@ if ($job_has_uuid)
# Claim this job, and make sure nobody else does
unless ($Job->update_attributes('is_locked_by_uuid' => $User->{'uuid'}) &&
$Job->{'is_locked_by_uuid'} == $User->{'uuid'}) {
- croak("Error while updating / locking job");
+ Log(undef, "Error while updating / locking job, exiting 111");
+ exit(111);
}
$Job->update_attributes('started_at' => scalar gmtime,
'running' => 1,
@@ -688,7 +697,9 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
my @execargs = ('bash', '-c', $command);
srun (\@srunargs, \@execargs, undef, $build_script_to_send);
- exit (111);
+ # exec() failed, we assume nothing happened.
+ Log(undef, "srun() failed on build script");
+ die;
}
close("writer");
if (!defined $childpid)
diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index 5a990f0..58e6645 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -375,11 +375,11 @@ class Dispatcher
$stderr.puts j_done[:stderr_buf] + "\n"
end
- # Wait the thread
- j_done[:wait_thr].value
+ # Wait the thread (returns a Process::Status)
+ exit_status = j_done[:wait_thr].value
jobrecord = Job.find_by_uuid(job_done.uuid)
- if jobrecord.started_at
+ if exit_status.to_i != 111 and jobrecord.started_at
# Clean up state fields in case crunch-job exited without
# putting the job in a suitable "finished" state.
jobrecord.running = false
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits mailing list