[ARVADOS] updated: 55db20860c90c60dd231083c1cc81001c3fbe971
git at public.curoverse.com
git at public.curoverse.com
Mon Aug 11 18:40:34 EDT 2014
Summary of changes:
apps/workbench/app/helpers/application_helper.rb | 2 +-
.../app/views/application/_choose.html.erb | 13 +-
.../test/integration/pipeline_instances_test.rb | 1 +
doc/_config.yml | 2 +-
...py.liquid => _concurrent_hash_script_py.liquid} | 0
.../topics/tutorial-job-debug.html.textile.liquid | 163 --------------------
.../topics/tutorial-parallel.html.textile.liquid | 22 +--
.../tutorial-firstscript.html.textile.liquid | 165 ++++++++-------------
...uid => tutorial-submit-job.html.textile.liquid} | 11 +-
sdk/cli/bin/crunch-job | 24 +--
services/api/script/crunch-dispatch.rb | 15 +-
11 files changed, 117 insertions(+), 301 deletions(-)
rename doc/_includes/{_parallel_hash_script_py.liquid => _concurrent_hash_script_py.liquid} (100%)
delete mode 100644 doc/user/topics/tutorial-job-debug.html.textile.liquid
copy doc/user/tutorials/{tutorial-firstscript.html.textile.liquid => tutorial-submit-job.html.textile.liquid} (90%)
via 55db20860c90c60dd231083c1cc81001c3fbe971 (commit)
via a6010696b246cc43c721a35d56179c629d54e798 (commit)
via 6c91320c3a53ddbe1b81bee8ed6322c20acbd047 (commit)
via aa923c951ff68e7131628c449b5e2f8a74e2a21e (commit)
via 965bf2a51134ed26fb23d2209d9a0d319fe05744 (commit)
via cd3a0b27c6985f72d58581d43c7cd6686a977f56 (commit)
via 7f4ff745bfd5356e885e13b4984d93c5c5b204f1 (commit)
via 774a5d8543e7335a8580c7c86f2436b96aff1024 (commit)
via 872cf2e096a7b72722aa76040339a23e962cdf96 (commit)
via 802fd8ec8df8a36029c76b20e472f78c09772950 (commit)
via b79c440b6af5cf4855e3fb3fb510b3eb8de1da9c (commit)
via fb0585a11cc10929cd8121ddd219855172754ef3 (commit)
from 7ea5d6161e4eab67795fbe2758d98822a9a03f13 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 55db20860c90c60dd231083c1cc81001c3fbe971
Author: Tom Clegg <tom at curoverse.com>
Date: Mon Aug 11 18:33:52 2014 -0400
3570: Use exit code 75 to mean "temporary locking failure". Add comments.
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index d5edf0be..e7cac18 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -84,6 +84,8 @@ use File::Temp;
use Fcntl ':flock';
use File::Path qw( make_path );
+use constant EX_TEMPFAIL => 75;
+
$ENV{"TMPDIR"} ||= "/tmp";
unless (defined $ENV{"CRUNCH_TMP"}) {
$ENV{"CRUNCH_TMP"} = $ENV{"TMPDIR"} . "/crunch-job";
@@ -151,24 +153,24 @@ if ($job_has_uuid)
$Job = $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
if (!$force_unlock) {
# If some other crunch-job process has grabbed this job (or we see
- # other evidence that the job is already underway) we exit 111 so
- # crunch-dispatch (our parent process) doesn't mark the job as
- # failed.
+ # other evidence that the job is already underway) we exit
+ # EX_TEMPFAIL so crunch-dispatch (our parent process) doesn't
+ # mark the job as failed.
if ($Job->{'is_locked_by_uuid'}) {
- Log(undef, "Job is locked by " . $Job->{'is_locked_by_uuid'} . ", exiting 111");
- exit(111);
+ Log(undef, "Job is locked by " . $Job->{'is_locked_by_uuid'});
+ exit EX_TEMPFAIL;
}
if ($Job->{'success'} ne undef) {
Log(undef, "Job 'success' flag (" . $Job->{'success'} . ") is not null");
- exit(111);
+ exit EX_TEMPFAIL;
}
if ($Job->{'running'}) {
Log(undef, "Job 'running' flag is already set");
- exit(111);
+ exit EX_TEMPFAIL;
}
if ($Job->{'started_at'}) {
Log(undef, "Job 'started_at' time is already set (" . $Job->{'started_at'} . ")");
- exit(111);
+ exit EX_TEMPFAIL;
}
}
}
@@ -281,8 +283,8 @@ if ($job_has_uuid)
# Claim this job, and make sure nobody else does
unless ($Job->update_attributes('is_locked_by_uuid' => $User->{'uuid'}) &&
$Job->{'is_locked_by_uuid'} == $User->{'uuid'}) {
- Log(undef, "Error while updating / locking job, exiting 111");
- exit(111);
+ Log(undef, "Error while updating / locking job, exiting ".EX_TEMPFAIL);
+ exit EX_TEMPFAIL;
}
$Job->update_attributes('started_at' => scalar gmtime,
'running' => 1,
@@ -893,7 +895,7 @@ else {
Log (undef, "finish");
save_meta();
-exit 0;
+exit ($Job->{'success'} ? 1 : 0);
diff --git a/services/api/script/crunch-dispatch.rb b/services/api/script/crunch-dispatch.rb
index 58e6645..154fcf3 100755
--- a/services/api/script/crunch-dispatch.rb
+++ b/services/api/script/crunch-dispatch.rb
@@ -379,7 +379,7 @@ class Dispatcher
exit_status = j_done[:wait_thr].value
jobrecord = Job.find_by_uuid(job_done.uuid)
- if exit_status.to_i != 111 and jobrecord.started_at
+ if exit_status.to_i != 75 and jobrecord.started_at
# Clean up state fields in case crunch-job exited without
# putting the job in a suitable "finished" state.
jobrecord.running = false
@@ -392,7 +392,18 @@ class Dispatcher
# Don't fail the job if crunch-job didn't even get as far as
# starting it. If the job failed to run due to an infrastructure
# issue with crunch-job or slurm, we want the job to stay in the
- # queue.
+ # queue. If crunch-job exited after losing a race to another
+ # crunch-job process, it exits 75 and we should leave the job
+ # record alone so the winner of the race can do its thing.
+ #
+ # There is still an unhandled race condition: If our crunch-job
+ # process is about to lose a race with another crunch-job
+ # process, but crashes before getting to its "exit 75" (for
+ # example, "cannot fork" or "cannot reach API server") then we
+ # will assume incorrectly that it's our process's fault that
+ # jobrecord.started_at is non-nil, and mark the job as failed
+ # even though the winner of the race is probably still doing
+ # fine.
end
# Invalidate the per-job auth token
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list