[ARVADOS] updated: 34d92b238ebc107bf28dac5c7e3ce138ac84b2c1

Git user git at public.curoverse.com
Fri Apr 1 15:50:23 EDT 2016


Summary of changes:
 sdk/cli/bin/crunch-job | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

       via  34d92b238ebc107bf28dac5c7e3ce138ac84b2c1 (commit)
       via  86f774031fd38bd8d34341afd007fbea9e6da740 (commit)
      from  cafe405682669b7ebdec8db4ee083c3ca2761827 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit 34d92b238ebc107bf28dac5c7e3ce138ac84b2c1
Merge: cafe405 86f7740
Author: Brett Smith <brett at curoverse.com>
Date:   Fri Apr 1 15:50:01 2016 -0400

    Merge branch '8811-srun-sync-tempfail-wip'
    
    Closes #8811, #8862.


commit 86f774031fd38bd8d34341afd007fbea9e6da740
Author: Brett Smith <brett at curoverse.com>
Date:   Thu Mar 31 17:46:51 2016 -0400

    8811: crunch-job srun_sync detects and reports SLURM tempfails.
    
    preprocess_stderr needed updating to check for these tempfails even in
    cases where the child process does not have a slotindex.

diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index 86e018c..cc0b60c 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -1459,6 +1459,9 @@ sub readfrompipes
 sub preprocess_stderr
 {
   my $jobstepidx = shift;
+  # slotindex is only defined for children running Arvados job tasks.
+  # Be prepared to handle the undef case (for setup srun calls, etc.).
+  my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
 
   while ($jobstep[$jobstepidx]->{stderr} =~ /^(.*?)\n/) {
     my $line = $1;
@@ -1468,19 +1471,16 @@ sub preprocess_stderr
       # whoa.
       $main::please_freeze = 1;
     }
-    elsif (!exists $jobstep[$jobstepidx]->{slotindex}) {
-      # Skip the following tempfail checks if this srun proc isn't
-      # attached to a particular worker slot.
-    }
     elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) {
-      my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
-      $slot[$job_slot_index]->{node}->{fail_count}++;
       $jobstep[$jobstepidx]->{tempfail} = 1;
-      ban_node_by_slot($job_slot_index);
+      if (defined($job_slot_index)) {
+        $slot[$job_slot_index]->{node}->{fail_count}++;
+        ban_node_by_slot($job_slot_index);
+      }
     }
     elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) {
       $jobstep[$jobstepidx]->{tempfail} = 1;
-      ban_node_by_slot($jobstep[$jobstepidx]->{slotindex});
+      ban_node_by_slot($job_slot_index) if (defined($job_slot_index));
     }
     elsif ($line =~ /\bKeep(Read|Write|Request)Error:/) {
       $jobstep[$jobstepidx]->{tempfail} = 1;
@@ -1970,6 +1970,11 @@ sub srun_sync
   delete $reader{$jobstepidx};
 
   my $j = pop @jobstep;
+  # If the srun showed signs of tempfail, ensure the caller treats that as a
+  # failure case.
+  if ($main::please_freeze || $j->{tempfail}) {
+    $exited ||= 255;
+  }
   return ($exited, $j->{stdout_captured}, $j->{stderr_captured});
 }
 

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list