[ARVADOS] created: 260442aac2e9f1ef6ae913a8c9e69ed9caa2e04c
Git user
git at public.curoverse.com
Fri Nov 18 11:43:04 EST 2016
at 260442aac2e9f1ef6ae913a8c9e69ed9caa2e04c (commit)
commit 260442aac2e9f1ef6ae913a8c9e69ed9caa2e04c
Author: Tom Clegg <tom at curoverse.com>
Date: Fri Nov 18 11:41:27 2016 -0500
10470: Recognize more slurm error messages.
Example from slurm 14.03.9:
srun: error: _server_read: fd 12 got error or unexpected eof reading header
srun: error: step_launch_notify_io_failure: aborting, io error with slurmstepd on node 0
srun: Job step aborted: Waiting up to 2 seconds for job step to finish.
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index be14be9..8bebba4 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -1519,7 +1519,7 @@ sub preprocess_stderr
$st->{node}->{fail_count}++;
}
}
- elsif ($line =~ /srun: error: (Node failure on|Aborting, .*\bio error\b)/) {
+ elsif ($line =~ /srun: error: .*\b(Node failure on|.*Aborting, .*\bio error\b)/i) {
$jobstep[$jobstepidx]->{tempfail} = 1;
if (defined($job_slot_index)) {
$slot[$job_slot_index]->{node}->{fail_count}++;
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits mailing list