[ARVADOS] updated: 5e0dd2c6b8f080c81ff6077e629f5ec9f377802f

git at public.curoverse.com git at public.curoverse.com
Thu May 28 17:14:11 EDT 2015


Summary of changes:
 sdk/cli/bin/crunch-job | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

  discards  36dd29f886873ab89661e7574fb8eedd265be215 (commit)
       via  5e0dd2c6b8f080c81ff6077e629f5ec9f377802f (commit)

This update added new revisions after undoing existing revisions.  That is
to say, the old revision is not a strict subset of the new revision.  This
situation occurs when you --force push a change and generate a repository
containing something like this:

 * -- * -- B -- O -- O -- O (36dd29f886873ab89661e7574fb8eedd265be215)
            \
             N -- N -- N (5e0dd2c6b8f080c81ff6077e629f5ec9f377802f)

When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit 5e0dd2c6b8f080c81ff6077e629f5ec9f377802f
Author: Tom Clegg <tom at curoverse.com>
Date:   Thu May 28 17:13:44 2015 -0400

    6146: Exit TEMPFAIL early (without failing the job) if worker nodes cannot run a trivial command.
    
    This is meant to improve the way we handle a couple of edge cases.
    
    1. A worker node doesn't get bootstrapped properly. It works well
    enough to persuade nodemanager and the API server that it's alive and
    ready to run jobs, but it can't actually run jobs. This means there's
    a bug in the bootstrapping process -- its startup script shouldn't
    tell slurm State=RESUME without checking itself -- but even so this
    doesn't deserve to fail a job: it's definitely a system problem,
    there's zero chance a different job would have gone any differently.
    
    2. A worker node has a hardware problem, or it has fallen off the
    network, or something like that, but slurm hasn't yet noticed and set
    its state to DOWN, so slurm still uses it to satisfy crunch-dispatch's
    "salloc" commands. As above, there's zero chance this could have gone
    differently for any other job, so it doesn't make sense to fail the
    job.

diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index 6cdaf90..5414033 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -125,12 +125,14 @@ my $jobspec;
 my $job_api_token;
 my $no_clear_tmp;
 my $resume_stash;
+my $docker_bin = "/usr/bin/docker.io";
 GetOptions('force-unlock' => \$force_unlock,
            'git-dir=s' => \$git_dir,
            'job=s' => \$jobspec,
            'job-api-token=s' => \$job_api_token,
            'no-clear-tmp' => \$no_clear_tmp,
            'resume-stash=s' => \$resume_stash,
+           'docker-bin=s' => \$docker_bin,
     );
 
 if (defined $job_api_token) {
@@ -138,7 +140,6 @@ if (defined $job_api_token) {
 }
 
 my $have_slurm = exists $ENV{SLURM_JOBID} && exists $ENV{SLURM_NODELIST};
-my $local_job = 0;
 
 
 $SIG{'USR1'} = sub
@@ -150,8 +151,6 @@ $SIG{'USR2'} = sub
   $main::ENV{CRUNCH_DEBUG} = 0;
 };
 
-
-
 my $arv = Arvados->new('apiVersion' => 'v1');
 
 my $Job;
@@ -160,12 +159,41 @@ my $dbh;
 my $sth;
 my @jobstep;
 
-my $User = api_call("users/current");
-
+my $local_job;
 if ($jobspec =~ /^[-a-z\d]+$/)
 {
   # $jobspec is an Arvados UUID, not a JSON job specification
   $Job = api_call("jobs/get", uuid => $jobspec);
+  $local_job = 0;
+}
+else
+{
+  $Job = JSON::decode_json($jobspec);
+  $local_job = 1;
+}
+
+
+# Make sure our workers (our slurm nodes, localhost, or whatever) are
+# at least able to run basic commands: they aren't down or severely
+# misconfigured.
+my $cmd = ['true'];
+if ($Job->{docker_image_locator}) {
+  $cmd = [$docker_bin, 'ps', '-q'];
+}
+Log(undef, "Sanity check is `@$cmd`");
+srun(["srun", "--nodes=\Q$ENV{SLURM_NNODES}\E", "--ntasks-per-node=1"],
+     $cmd,
+     {fork => 1});
+if ($? != 0) {
+  Log(undef, "Sanity check failed: ".exit_status_s($?));
+  exit EX_TEMPFAIL;
+}
+Log(undef, "Sanity check OK");
+
+
+my $User = api_call("users/current");
+
+if (!$local_job) {
   if (!$force_unlock) {
     # Claim this job, and make sure nobody else does
     eval { api_call("jobs/lock", uuid => $Job->{uuid}); };
@@ -177,8 +205,6 @@ if ($jobspec =~ /^[-a-z\d]+$/)
 }
 else
 {
-  $Job = JSON::decode_json($jobspec);
-
   if (!$resume_stash)
   {
     map { croak ("No $_ specified") unless $Job->{$_} }
@@ -376,7 +402,6 @@ if (!defined $no_clear_tmp) {
 }
 
 # If this job requires a Docker image, install that.
-my $docker_bin = "/usr/bin/docker.io";
 my ($docker_locator, $docker_stream, $docker_hash, $docker_limitmem);
 if ($docker_locator = $Job->{docker_image_locator}) {
   ($docker_stream, $docker_hash) = find_docker_image($docker_locator);
@@ -1139,7 +1164,7 @@ sub reapchildren
 
     Log ($jobstepid, sprintf('failure (#%d, %s) after %d seconds',
                              ++$Jobstep->{'failures'},
-                             $temporary_fail ? 'temporary ' : 'permanent',
+                             $temporary_fail ? 'temporary' : 'permanent',
                              $elapsed));
 
     if (!$temporary_fail || $Jobstep->{'failures'} >= 3) {
@@ -1708,7 +1733,13 @@ sub srun
   my $show_cmd = Dumper($args);
   $show_cmd =~ s/(TOKEN\\*=)[^\s\']+/${1}[...]/g;
   $show_cmd =~ s/\n/ /g;
-  warn "starting: $show_cmd\n";
+  if ($opts->{fork}) {
+    Log(undef, "starting: $show_cmd");
+  } else {
+    # This is a child process: parent is in charge of reading our
+    # stderr and copying it to Log() if needed.
+    warn "starting: $show_cmd\n";
+  }
 
   if (defined $stdin) {
     my $child = open STDIN, "-|";

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list