[ARVADOS] updated: 1b5c30eb957594e00a09df745df7630f661e3807
git at public.curoverse.com
git at public.curoverse.com
Wed May 27 15:49:20 EDT 2015
Summary of changes:
sdk/cli/bin/crunch-job | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
discards a73f5150ef0547b7e151deed116be22bdc703842 (commit)
via 1b5c30eb957594e00a09df745df7630f661e3807 (commit)
This update added new revisions after undoing existing revisions. That is
to say, the old revision is not a strict subset of the new revision. This
situation occurs when you --force push a change and generate a repository
containing something like this:
* -- * -- B -- O -- O -- O (a73f5150ef0547b7e151deed116be22bdc703842)
           \
            N -- N -- N (1b5c30eb957594e00a09df745df7630f661e3807)
When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit 1b5c30eb957594e00a09df745df7630f661e3807
Author: Tom Clegg <tom at curoverse.com>
Date: Wed May 27 15:48:54 2015 -0400
6146: Retry install (max 3 attempts) if install script fails with no error messages.
Also: if install fails, croak() instead of exit(1) so we still get a log file.
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index c748904..6cdaf90 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -118,6 +118,7 @@ $ENV{"CRUNCH_INSTALL"} = "$ENV{CRUNCH_TMP}/opt";
$ENV{"CRUNCH_WORK"} = $ENV{"JOB_WORK"}; # deprecated
mkdir ($ENV{"JOB_WORK"});
+my %proc;
my $force_unlock;
my $git_dir;
my $jobspec;
@@ -589,56 +590,89 @@ if (!defined $git_archive) {
}
}
else {
- Log(undef, "Run install script on all workers");
-
- my @srunargs = ("srun",
- "--nodelist=$nodelist",
- "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
- my @execargs = ("sh", "-c",
- "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
-
- $ENV{"CRUNCH_GIT_ARCHIVE_HASH"} = md5_hex($git_archive);
- my ($install_stderr_r, $install_stderr_w);
- pipe $install_stderr_r, $install_stderr_w or croak("pipe() failed: $!");
- set_nonblocking($install_stderr_r);
- my $installpid = fork();
- if ($installpid == 0)
- {
- close($install_stderr_r);
- fcntl($install_stderr_w, F_SETFL, 0) or croak($!); # no close-on-exec
- open(STDOUT, ">&", $install_stderr_w);
- open(STDERR, ">&", $install_stderr_w);
- srun (\@srunargs, \@execargs, {}, $build_script . $git_archive);
- exit (1);
- }
- close($install_stderr_w);
- my $stderr_buf = '';
- while ($installpid != waitpid(-1, WNOHANG)) {
- freeze_if_want_freeze ($installpid);
- # Wait up to 0.1 seconds for something to appear on stderr, then
- # do a non-blocking read.
- my $bits = fhbits($install_stderr_r);
- select ($bits, undef, $bits, 0.1);
- if (0 < sysread ($install_stderr_r, $stderr_buf, 8192, length($stderr_buf)))
+ my $install_exited;
+ my $install_script_tries_left = 3;
+ for (my $attempts = 0; $attempts < 3; $attempts++) {
+ Log(undef, "Run install script on all workers");
+
+ my @srunargs = ("srun",
+ "--nodelist=$nodelist",
+ "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
+ my @execargs = ("sh", "-c",
+ "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
+
+ $ENV{"CRUNCH_GIT_ARCHIVE_HASH"} = md5_hex($git_archive);
+ my ($install_stderr_r, $install_stderr_w);
+ pipe $install_stderr_r, $install_stderr_w or croak("pipe() failed: $!");
+ set_nonblocking($install_stderr_r);
+ my $installpid = fork();
+ if ($installpid == 0)
{
- while ($stderr_buf =~ /^(.*?)\n/) {
- my $line = $1;
- substr $stderr_buf, 0, 1+length($line), "";
- Log(undef, "stderr $line");
+ close($install_stderr_r);
+ fcntl($install_stderr_w, F_SETFL, 0) or croak($!); # no close-on-exec
+ open(STDOUT, ">&", $install_stderr_w);
+ open(STDERR, ">&", $install_stderr_w);
+ srun (\@srunargs, \@execargs, {}, $build_script . $git_archive);
+ exit (1);
+ }
+ close($install_stderr_w);
+ # Tell freeze_if_want_freeze how to kill the child, otherwise the
+ # "waitpid(installpid)" loop won't get interrupted by a freeze:
+ $proc{$installpid} = {};
+ my $stderr_buf = '';
+ # Track whether anything appears on stderr other than slurm errors
+ # ("srun: ...") and the "starting: ..." message printed by the
+ # srun subroutine itself:
+ my $stderr_anything_from_script = 0;
+ my $match_our_own_errors = '^(srun: error: |starting: \[)';
+ while ($installpid != waitpid(-1, WNOHANG)) {
+ freeze_if_want_freeze ($installpid);
+ # Wait up to 0.1 seconds for something to appear on stderr, then
+ # do a non-blocking read.
+ my $bits = fhbits($install_stderr_r);
+ select ($bits, undef, $bits, 0.1);
+ if (0 < sysread ($install_stderr_r, $stderr_buf, 8192, length($stderr_buf)))
+ {
+ while ($stderr_buf =~ /^(.*?)\n/) {
+ my $line = $1;
+ substr $stderr_buf, 0, 1+length($line), "";
+ Log(undef, "stderr $line");
+ if ($line !~ /$match_our_own_errors/) {
+ $stderr_anything_from_script = 1;
+ }
+ }
}
}
- }
- my $install_exited = $?;
- close($install_stderr_r);
- if (length($stderr_buf) > 0) {
- Log(undef, "stderr $stderr_buf")
+ delete $proc{$installpid};
+ $install_exited = $?;
+ close($install_stderr_r);
+ if (length($stderr_buf) > 0) {
+ if ($stderr_buf !~ /$match_our_own_errors/) {
+ $stderr_anything_from_script = 1;
+ }
+ Log(undef, "stderr $stderr_buf")
+ }
+
+ Log (undef, "Install script exited ".exit_status_s($install_exited));
+ last if $install_exited == 0 || $main::please_freeze;
+ # If the install script fails but doesn't print an error message,
+ # the next thing anyone is likely to do is just run it again in
+ # case it was a transient problem like "slurm communication fails
+ # because the network isn't reliable enough". So we'll just do
+ # that ourselves (up to 3 attempts in total). OTOH, if there is an
+ # error message, the problem is more likely to have a real fix and
+ # we should fail the job so the fixing process can start, instead
+ # of doing 2 more attempts.
+ last if $stderr_anything_from_script;
}
- Log (undef, "Install script exited ".exit_status_s($install_exited));
foreach my $tar_filename (map { tar_filename_n($_); } (1..$git_tar_count)) {
unlink($tar_filename);
}
- exit (1) if $install_exited != 0;
+
+ if ($install_exited != 0) {
+ croak("Giving up");
+ }
}
foreach (qw (script script_version script_parameters runtime_constraints))
@@ -704,7 +738,6 @@ for (my $ii = $#freeslot; $ii >= 0; $ii--) {
}
Log(undef, "start level $level with $round_num_freeslots slots");
-my %proc;
my @holdslot;
my %reader;
my $progress_is_dirty = 1;
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list