[ARVADOS] created: faf5328304cd7c0b3ec1669c6f7f18c3f4daf25c

Git user git at public.curoverse.com
Tue Sep 13 08:52:53 EDT 2016


        at  faf5328304cd7c0b3ec1669c6f7f18c3f4daf25c (commit)


commit faf5328304cd7c0b3ec1669c6f7f18c3f4daf25c
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Mon Sep 12 22:13:11 2016 -0400

    9924: Mark all slots as "failed" on "Unable to confirm allocation" error.

diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index e0aff31..0ad5a6b 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -1510,8 +1510,15 @@ sub preprocess_stderr
     substr $jobstep[$jobstepidx]->{stderr}, 0, 1+length($line), "";
     Log ($jobstepidx, "stderr $line");
     if ($line =~ /srun: error: (SLURM job $ENV{SLURM_JOB_ID} has expired|Unable to confirm allocation for job $ENV{SLURM_JOB_ID})/) {
-      # whoa.
+      # If the allocation is revoked, we can't possibly continue, so mark all
+      # slots as failed.  This will cause the overall exit code to be
+      # EX_RETRY_UNLOCKED instead of failure so that crunch_dispatch can re-run
+      # this job.  whoa.
       $main::please_freeze = 1;
+      $working_slot_count = 0;
+      foreach my $st (@slot) {
+        $st->{node}->{fail_count}++;
+      }
     }
     elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) {
       $jobstep[$jobstepidx]->{tempfail} = 1;

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list