[ARVADOS] created: faf5328304cd7c0b3ec1669c6f7f18c3f4daf25c
Git user
git at public.curoverse.com
Tue Sep 13 08:52:53 EDT 2016
at faf5328304cd7c0b3ec1669c6f7f18c3f4daf25c (commit)
commit faf5328304cd7c0b3ec1669c6f7f18c3f4daf25c
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Mon Sep 12 22:13:11 2016 -0400
9924: Mark all slots as "failed" on "Unable to confirm allocation" error.
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index e0aff31..0ad5a6b 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -1510,8 +1510,15 @@ sub preprocess_stderr
substr $jobstep[$jobstepidx]->{stderr}, 0, 1+length($line), "";
Log ($jobstepidx, "stderr $line");
if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/) {
- # whoa.
+ # If the allocation is revoked, we can't possibly continue, so mark all
+ # slots as failed. This will cause the overall exit code to be
+ # EX_RETRY_UNLOCKED instead of failure so that crunch_dispatch can re-run
+ # this job. whoa.
$main::please_freeze = 1;
+ $working_slot_count = 0;
+ foreach my $st (@slot) {
+ $st->{node}->{fail_count}++;
+ }
}
elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) {
$jobstep[$jobstepidx]->{tempfail} = 1;
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list