[ARVADOS] updated: 34d92b238ebc107bf28dac5c7e3ce138ac84b2c1
Git user
git at public.curoverse.com
Fri Apr 1 15:50:23 EDT 2016
Summary of changes:
sdk/cli/bin/crunch-job | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
via 34d92b238ebc107bf28dac5c7e3ce138ac84b2c1 (commit)
via 86f774031fd38bd8d34341afd007fbea9e6da740 (commit)
from cafe405682669b7ebdec8db4ee083c3ca2761827 (commit)
The revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit 34d92b238ebc107bf28dac5c7e3ce138ac84b2c1
Merge: cafe405 86f7740
Author: Brett Smith <brett at curoverse.com>
Date: Fri Apr 1 15:50:01 2016 -0400
Merge branch '8811-srun-sync-tempfail-wip'
Closes #8811, #8862.
commit 86f774031fd38bd8d34341afd007fbea9e6da740
Author: Brett Smith <brett at curoverse.com>
Date: Thu Mar 31 17:46:51 2016 -0400
8811: crunch-job srun_sync detects and reports SLURM tempfails.
preprocess_stderr needed updating to check for these tempfails even in
cases where the child process does not have a slotindex.
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index 86e018c..cc0b60c 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -1459,6 +1459,9 @@ sub readfrompipes
sub preprocess_stderr
{
my $jobstepidx = shift;
+ # slotindex is only defined for children running Arvados job tasks.
+ # Be prepared to handle the undef case (for setup srun calls, etc.).
+ my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
while ($jobstep[$jobstepidx]->{stderr} =~ /^(.*?)\n/) {
my $line = $1;
@@ -1468,19 +1471,16 @@ sub preprocess_stderr
# whoa.
$main::please_freeze = 1;
}
- elsif (!exists $jobstep[$jobstepidx]->{slotindex}) {
- # Skip the following tempfail checks if this srun proc isn't
- # attached to a particular worker slot.
- }
elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) {
- my $job_slot_index = $jobstep[$jobstepidx]->{slotindex};
- $slot[$job_slot_index]->{node}->{fail_count}++;
$jobstep[$jobstepidx]->{tempfail} = 1;
- ban_node_by_slot($job_slot_index);
+ if (defined($job_slot_index)) {
+ $slot[$job_slot_index]->{node}->{fail_count}++;
+ ban_node_by_slot($job_slot_index);
+ }
}
elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) {
$jobstep[$jobstepidx]->{tempfail} = 1;
- ban_node_by_slot($jobstep[$jobstepidx]->{slotindex});
+ ban_node_by_slot($job_slot_index) if (defined($job_slot_index));
}
elsif ($line =~ /\bKeep(Read|Write|Request)Error:/) {
$jobstep[$jobstepidx]->{tempfail} = 1;
@@ -1970,6 +1970,11 @@ sub srun_sync
delete $reader{$jobstepidx};
my $j = pop @jobstep;
+ # If the srun showed signs of tempfail, ensure the caller treats that as a
+ # failure case.
+ if ($main::please_freeze || $j->{tempfail}) {
+ $exited ||= 255;
+ }
return ($exited, $j->{stdout_captured}, $j->{stderr_captured});
}
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list