[ARVADOS] updated: fbd54468b13466839c24d880a3d041d0a49371af

Mon Feb 29 16:00:46 EST 2016

Summary of changes:
 sdk/cli/bin/crunch-job | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

  discards  e185fca1249abda3de3ac9237218bb0c8560d2ab (commit)
       via  fbd54468b13466839c24d880a3d041d0a49371af (commit)

This update added new revisions after undoing existing revisions.  That is
to say, the old revision is not a strict subset of the new revision.  This
situation occurs when you --force push a change and generate a repository
containing something like this:

 * -- * -- B -- O -- O -- O (e185fca1249abda3de3ac9237218bb0c8560d2ab)
            \
             N -- N -- N (fbd54468b13466839c24d880a3d041d0a49371af)

When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit fbd54468b13466839c24d880a3d041d0a49371af
Merge: ad48bb3 3c90228
Author: Tom Clegg <tom at curoverse.com>
Date:   Mon Feb 29 16:00:39 2016 -0500

    8099: 7263: Merge branch 'hgi/7263-even-better-busy-behavior' of github.com:wtsi-hgi/arvados into 8099-babysit-all-srun
    
    Conflicts:
    	sdk/cli/bin/crunch-job

diff --cc sdk/cli/bin/crunch-job
index b63886e,c8a1de9..ca9db1d

--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@@ -1130,122 -1181,133 +1131,133 @@@ sub update_progress_stat
  
  sub reapchildren
  {
-   my $pid = waitpid (-1, WNOHANG);
-   return 0 if $pid <= 0;
- 
-   my $whatslot = ($slot[$proc{$pid}->{slot}]->{node}->{name}
- 		  . "."
- 		  . $slot[$proc{$pid}->{slot}]->{cpu});
-   my $jobstepidx = $proc{$pid}->{jobstepidx};
-   my $elapsed = time - $proc{$pid}->{time};
-   my $Jobstep = $jobstep[$jobstepidx];
- 
-   my $childstatus = $?;
-   my $exitvalue = $childstatus >> 8;
-   my $exitinfo = "exit ".exit_status_s($childstatus);
-   $Jobstep->{'arvados_task'}->reload;
-   my $task_success = $Jobstep->{'arvados_task'}->{success};
- 
-   Log ($jobstepidx, "child $pid on $whatslot $exitinfo success=$task_success");
- 
-   if (!defined $task_success) {
-     # task did not indicate one way or the other --> fail
-     Log($jobstepidx, sprintf(
-           "ERROR: Task process exited %s, but never updated its task record to indicate success and record its output.",
-           exit_status_s($childstatus)));
-     $Jobstep->{'arvados_task'}->{success} = 0;
-     $Jobstep->{'arvados_task'}->save;
-     $task_success = 0;
-   }
- 
-   if (!$task_success)
+   my $children_reaped = 0;
 -
 -  while((my $pid = waitpid (-1, WNOHANG)) > 0)
++  while ((my $pid = waitpid (-1, WNOHANG)) > 0)
    {
-     my $temporary_fail;
-     $temporary_fail ||= $Jobstep->{tempfail};
-     $temporary_fail ||= ($exitvalue == TASK_TEMPFAIL);
- 
-     ++$thisround_failed;
-     ++$thisround_failed_multiple if $Jobstep->{'failures'} >= 1;
- 
-     # Check for signs of a failed or misconfigured node
-     if (++$slot[$proc{$pid}->{slot}]->{node}->{losing_streak} >=
- 	2+$slot[$proc{$pid}->{slot}]->{node}->{ncpus}) {
-       # Don't count this against jobstep failure thresholds if this
-       # node is already suspected faulty and srun exited quickly
-       if ($slot[$proc{$pid}->{slot}]->{node}->{hold_until} &&
- 	  $elapsed < 5) {
- 	Log ($jobstepidx, "blaming failure on suspect node " .
-              $slot[$proc{$pid}->{slot}]->{node}->{name});
-         $temporary_fail ||= 1;
-       }
-       ban_node_by_slot($proc{$pid}->{slot});
+     my $childstatus = $?;
++
+     my $whatslot = ($slot[$proc{$pid}->{slot}]->{node}->{name}
+                     . "."
+                     . $slot[$proc{$pid}->{slot}]->{cpu});
 -    my $jobstepid = $proc{$pid}->{jobstep};
++    my $jobstepidx = $proc{$pid}->{jobstepidx};
+ 
+     if (!WIFEXITED($childstatus))
+     {
+       # child did not exit (may be temporarily stopped)
 -      Log ($jobstepid, "child $pid did not actually exit in reapchildren, ignoring for now.");
++      Log ($jobstepidx, "child $pid did not actually exit in reapchildren, ignoring for now.");
+       next;
      }
  
-     Log ($jobstepidx, sprintf('failure (#%d, %s) after %d seconds',
-                              ++$Jobstep->{'failures'},
-                              $temporary_fail ? 'temporary' : 'permanent',
-                              $elapsed));
+     $children_reaped++;
+     my $elapsed = time - $proc{$pid}->{time};
 -    my $Jobstep = $jobstep[$jobstepid];
++    my $Jobstep = $jobstep[$jobstepidx];
+ 
+     my $exitvalue = $childstatus >> 8;
+     my $exitinfo = "exit ".exit_status_s($childstatus);
+     $Jobstep->{'arvados_task'}->reload;
+     my $task_success = $Jobstep->{'arvados_task'}->{success};
+ 
 -    Log ($jobstepid, "child $pid on $whatslot $exitinfo success=$task_success");
++    Log ($jobstepidx, "child $pid on $whatslot $exitinfo success=$task_success");
+ 
+     if (!defined $task_success) {
+       # task did not indicate one way or the other --> fail
 -      Log($jobstepid, sprintf(
++      Log($jobstepidx, sprintf(
+             "ERROR: Task process exited %s, but never updated its task record to indicate success and record its output.",
+             exit_status_s($childstatus)));
+       $Jobstep->{'arvados_task'}->{success} = 0;
+       $Jobstep->{'arvados_task'}->save;
+       $task_success = 0;
+     }
  
-     if (!$temporary_fail || $Jobstep->{'failures'} >= 3) {
-       # Give up on this task, and the whole job
-       $main::success = 0;
+     if (!$task_success)
+     {
+       my $temporary_fail;
+       $temporary_fail ||= $Jobstep->{tempfail};
+       $temporary_fail ||= ($exitvalue == TASK_TEMPFAIL);
+ 
+       ++$thisround_failed;
+       ++$thisround_failed_multiple if $Jobstep->{'failures'} >= 1;
+ 
+       # Check for signs of a failed or misconfigured node
+       if (++$slot[$proc{$pid}->{slot}]->{node}->{losing_streak} >=
+           2+$slot[$proc{$pid}->{slot}]->{node}->{ncpus}) {
+         # Don't count this against jobstep failure thresholds if this
+         # node is already suspected faulty and srun exited quickly
+         if ($slot[$proc{$pid}->{slot}]->{node}->{hold_until} &&
+             $elapsed < 5) {
 -          Log ($jobstepid, "blaming failure on suspect node " .
++          Log ($jobstepidx, "blaming failure on suspect node " .
+                $slot[$proc{$pid}->{slot}]->{node}->{name});
+           $temporary_fail ||= 1;
+         }
+         ban_node_by_slot($proc{$pid}->{slot});
+       }
+ 
 -      Log ($jobstepid, sprintf('failure (#%d, %s) after %d seconds',
 -                               ++$Jobstep->{'failures'},
 -                               $temporary_fail ? 'temporary' : 'permanent',
 -                               $elapsed));
++      Log ($jobstepidx, sprintf('failure (#%d, %s) after %d seconds',
++                                ++$Jobstep->{'failures'},
++                                $temporary_fail ? 'temporary' : 'permanent',
++                                $elapsed));
+ 
+       if (!$temporary_fail || $Jobstep->{'failures'} >= 3) {
+         # Give up on this task, and the whole job
+         $main::success = 0;
+       }
+       # Put this task back on the todo queue
 -      push @jobstep_todo, $jobstepid;
++      push @jobstep_todo, $jobstepidx;
+       $Job->{'tasks_summary'}->{'failed'}++;
      }
-     # Put this task back on the todo queue
-     push @jobstep_todo, $jobstepidx;
-     $Job->{'tasks_summary'}->{'failed'}++;
-   }
-   else
-   {
-     ++$thisround_succeeded;
-     $slot[$proc{$pid}->{slot}]->{node}->{losing_streak} = 0;
-     $slot[$proc{$pid}->{slot}]->{node}->{hold_until} = 0;
-     $slot[$proc{$pid}->{slot}]->{node}->{fail_count} = 0;
-     push @jobstep_done, $jobstepidx;
-     Log ($jobstepidx, "success in $elapsed seconds");
-   }
-   $Jobstep->{exitcode} = $childstatus;
-   $Jobstep->{finishtime} = time;
-   $Jobstep->{'arvados_task'}->{finished_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{finishtime});
-   $Jobstep->{'arvados_task'}->save;
-   process_stderr_final ($jobstepidx);
-   Log ($jobstepidx, sprintf("task output (%d bytes): %s",
-                            length($Jobstep->{'arvados_task'}->{output}),
-                            $Jobstep->{'arvados_task'}->{output}));
- 
-   close $reader{$jobstepidx};
-   delete $reader{$jobstepidx};
-   delete $slot[$proc{$pid}->{slot}]->{pid};
-   push @freeslot, $proc{$pid}->{slot};
-   delete $proc{$pid};
- 
-   if ($task_success) {
-     # Load new tasks
-     my $newtask_list = [];
-     my $newtask_results;
-     do {
-       $newtask_results = api_call(
-         "job_tasks/list",
-         'where' => {
-           'created_by_job_task_uuid' => $Jobstep->{'arvados_task'}->{uuid}
-         },
-         'order' => 'qsequence',
-         'offset' => scalar(@$newtask_list),
-       );
-       push(@$newtask_list, @{$newtask_results->{items}});
-     } while (@{$newtask_results->{items}});
-     foreach my $arvados_task (@$newtask_list) {
-       my $jobstep = {
-         'level' => $arvados_task->{'sequence'},
-         'failures' => 0,
-         'arvados_task' => $arvados_task
-       };
-       push @jobstep, $jobstep;
-       push @jobstep_todo, $#jobstep;
+     else
+     {
+       ++$thisround_succeeded;
+       $slot[$proc{$pid}->{slot}]->{node}->{losing_streak} = 0;
+       $slot[$proc{$pid}->{slot}]->{node}->{hold_until} = 0;
+       $slot[$proc{$pid}->{slot}]->{node}->{fail_count} = 0;
 -      push @jobstep_done, $jobstepid;
 -      Log ($jobstepid, "success in $elapsed seconds");
++      push @jobstep_done, $jobstepidx;
++      Log ($jobstepidx, "success in $elapsed seconds");
      }
+     $Jobstep->{exitcode} = $childstatus;
+     $Jobstep->{finishtime} = time;
+     $Jobstep->{'arvados_task'}->{finished_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{finishtime});
+     $Jobstep->{'arvados_task'}->save;
 -    process_stderr ($jobstepid, $task_success);
 -    Log ($jobstepid, sprintf("task output (%d bytes): %s",
 -                             length($Jobstep->{'arvados_task'}->{output}),
 -                             $Jobstep->{'arvados_task'}->{output}));
++    process_stderr_final ($jobstepidx);
++    Log ($jobstepidx, sprintf("task output (%d bytes): %s",
++                              length($Jobstep->{'arvados_task'}->{output}),
++                              $Jobstep->{'arvados_task'}->{output}));
+ 
 -    close $reader{$jobstepid};
 -    delete $reader{$jobstepid};
++    close $reader{$jobstepidx};
++    delete $reader{$jobstepidx};
+     delete $slot[$proc{$pid}->{slot}]->{pid};
+     push @freeslot, $proc{$pid}->{slot};
+     delete $proc{$pid};
+ 
+     if ($task_success) {
+       # Load new tasks
+       my $newtask_list = [];
+       my $newtask_results;
+       do {
+         $newtask_results = api_call(
+           "job_tasks/list",
+           'where' => {
+             'created_by_job_task_uuid' => $Jobstep->{'arvados_task'}->{uuid}
+           },
+           'order' => 'qsequence',
+           'offset' => scalar(@$newtask_list),
+             );
+         push(@$newtask_list, @{$newtask_results->{items}});
+       } while (@{$newtask_results->{items}});
+       foreach my $arvados_task (@$newtask_list) {
+         my $jobstep = {
+           'level' => $arvados_task->{'sequence'},
+           'failures' => 0,
+           'arvados_task' => $arvados_task
+         };
+         push @jobstep, $jobstep;
+         push @jobstep_todo, $#jobstep;
+       }
+     }
+     $progress_is_dirty = 1;
    }
  
-   $progress_is_dirty = 1;
-   1;
+   return $children_reaped;
  }
  
  sub check_refresh_wanted
@@@ -1292,10 -1351,13 +1304,13 @@@ sub check_squeu
    # squeue check interval (15s) this should make the squeue check an
    # infrequent event.
    my $silent_procs = 0;
-   for my $procinfo (values %proc)
 -  for my $js (map {$jobstep[$_->{jobstep}]} values %proc)
++  for my $js (map {$jobstep[$_->{jobstepidx}]} values %proc)
    {
-     my $jobstep = $jobstep[$procinfo->{jobstepidx}];
-     if ($jobstep->{stderr_at} < $last_squeue_check)
+     if (!exists($js->{stderr_at}))
+     {
+       $js->{stderr_at} = 0;
+     }
+     if ($js->{stderr_at} < $last_squeue_check)
      {
        $silent_procs++;
      }
@@@ -1305,16 -1367,16 +1320,16 @@@
    # use killem() on procs whose killtime is reached
    while (my ($pid, $procinfo) = each %proc)
    {
-     my $jobstep = $jobstep[$procinfo->{jobstepidx}];
 -    my $js = $jobstep[$procinfo->{jobstep}];
++    my $js = $jobstep[$procinfo->{jobstepidx}];
      if (exists $procinfo->{killtime}
          && $procinfo->{killtime} <= time
-         && $jobstep->{stderr_at} < $last_squeue_check)
+         && $js->{stderr_at} < $last_squeue_check)
      {
        my $sincewhen = "";
-       if ($jobstep->{stderr_at}) {
-         $sincewhen = " in last " . (time - $jobstep->{stderr_at}) . "s";
+       if ($js->{stderr_at}) {
+         $sincewhen = " in last " . (time - $js->{stderr_at}) . "s";
        }
 -      Log($procinfo->{jobstep}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)");
 +      Log($procinfo->{jobstepidx}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)");
        killem ($pid);
      }
    }
@@@ -1383,32 -1446,30 +1398,42 @@@ sub release_allocatio
  sub readfrompipes
  {
    my $gotsome = 0;
+   my %fd_job;
+   my $sel = IO::Select->new();
 -  foreach my $job (keys %reader)
 +  foreach my $jobstepidx (keys %reader)
    {
 -    my $fd = $reader{$job};
++    my $fd = $reader{$jobstepidx};
+     $sel->add($fd);
 -    $fd_job{$fd} = $job;
++    $fd_job{$fd} = $jobstepidx;
++
++    if (my $stdout_fd = $jobstep[$jobstepidx]->{stdout_r}) {
++      $sel->add($stdout_fd);
++      $fd_job{$stdout_fd} = $jobstepidx;
++    }
+   }
+   # select on all reader fds with 0.1s timeout
+   my @ready_fds = $sel->can_read(0.1);
+   foreach my $fd (@ready_fds)
+   {
      my $buf;
-     if ($jobstep[$jobstepidx]->{stdout_r} &&
-         0 < sysread ($jobstep[$jobstepidx]->{stdout_r}, $buf, 65536))
+     if (0 < sysread ($fd, $buf, 65536))
      {
++      $gotsome = 1;
        print STDERR $buf if $ENV{CRUNCH_DEBUG};
-       if (exists $jobstep[$jobstepidx]->{stdout_captured}) {
 -      my $job = $fd_job{$fd};
 -      $jobstep[$job]->{stderr_at} = time;
 -      $jobstep[$job]->{stderr} .= $buf;
++
++      my $jobstepidx = $fd_job{$fd};
++      if ($jobstep[$jobstepidx]->{stdout_r} == $fd) {
 +        $jobstep[$jobstepidx]->{stdout_captured} .= $buf;
++        next;
 +      }
-       $gotsome = 1;
-     }
-     if (0 < sysread ($reader{$jobstepidx}, $buf, 65536))
-     {
-       print STDERR $buf if $ENV{CRUNCH_DEBUG};
++
 +      $jobstep[$jobstepidx]->{stderr_at} = time;
 +      $jobstep[$jobstepidx]->{stderr} .= $buf;
-       if (exists $jobstep[$jobstepidx]->{stderr_captured}) {
-         $jobstep[$jobstepidx]->{stderr_captured} .= $buf;
-       }
-       $gotsome = 1;
  
        # Consume everything up to the last \n
 -      preprocess_stderr ($job);
 +      preprocess_stderr ($jobstepidx);
  
 -      if (length ($jobstep[$job]->{stderr}) > 16384)
 +      if (length ($jobstep[$jobstepidx]->{stderr}) > 16384)
        {
          # If we get a lot of stderr without a newline, chop off the
          # front to avoid letting our buffer grow indefinitely.

-----------------------------------------------------------------------


hooks/post-receive
--