[ARVADOS] updated: a1b5da5e536e8bfc58187965d11312d1fe883972

git at public.curoverse.com git at public.curoverse.com
Mon Jun 1 09:54:53 EDT 2015


Summary of changes:
 sdk/cli/bin/crunch-job | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

  discards  fc6473066977e983cddfc5455dde2d7816cc3d27 (commit)
       via  a1b5da5e536e8bfc58187965d11312d1fe883972 (commit)

This update added new revisions after undoing existing revisions.  That is
to say, the old revision is not a strict subset of the new revision.  This
situation occurs when you --force push a change and generate a repository
containing something like this:

 * -- * -- B -- O -- O -- O (fc6473066977e983cddfc5455dde2d7816cc3d27)
            \
             N -- N -- N (a1b5da5e536e8bfc58187965d11312d1fe883972)

When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit a1b5da5e536e8bfc58187965d11312d1fe883972
Author: Tom Clegg <tom at curoverse.com>
Date:   Sun May 31 02:02:19 2015 -0400

    6146: Improvements to "kill srun process if slurm task disappears" feature:
    
    * Log when we notice a process is orphaned.
    
    * Log when we decide to kill an orphaned process.
    
    * Use `squeue --jobs $SLURM_JOBID` so slurm doesn't have to tell us
      about other jobs' tasks.
    
    * Do not kill a process that is still reporting stderr.
    
    * Do not check `squeue` at all if every process has reported stderr
      since the last squeue check. (In such cases, it seems safe to assume
      no children are hung/dead.)
    
    * Use the same timer/interval (15 seconds) for both noticing and
      killing orphaned processes.

diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index 76eb95d..c536da6 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -342,8 +342,7 @@ my @jobstep_todo = ();
 my @jobstep_done = ();
 my @jobstep_tomerge = ();
 my $jobstep_tomerge_level = 0;
-my $squeue_checked;
-my $squeue_kill_checked;
+my $squeue_checked = 0;
 my $latest_refresh = scalar time;
 
 
@@ -1254,29 +1253,41 @@ sub check_refresh_wanted
 
 sub check_squeue
 {
-  # return if the kill list was checked <4 seconds ago
-  if (defined $squeue_kill_checked && $squeue_kill_checked > time - 4)
-  {
-    return;
-  }
-  $squeue_kill_checked = time;
+  my $last_squeue_check = $squeue_checked;
 
-  # use killem() on procs whose killtime is reached
-  for (keys %proc)
+  # Do not call `squeue` or check the kill list more than once every
+  # 15 seconds.
+  return if $last_squeue_check > time - 15;
+  $squeue_checked = time;
+
+  # Look for children from which we haven't received stderr data since
+  # the last squeue check. If no such children exist, all procs are
+  # alive and there's no need to even look at squeue.
+  #
+  # As long as the crunchstat poll interval (10s) is shorter than the
+  # squeue check interval (15s) this should make the squeue check an
+  # infrequent event.
+  my $silent_procs = 0;
+  for my $jobstep (values %proc)
   {
-    if (exists $proc{$_}->{killtime}
-	&& $proc{$_}->{killtime} <= time)
+    if ($jobstep->{stderr_at} < $last_squeue_check)
     {
-      killem ($_);
+      $silent_procs++;
     }
   }
+  return if $silent_procs == 0;
 
-  # return if the squeue was checked <60 seconds ago
-  if (defined $squeue_checked && $squeue_checked > time - 60)
+  # use killem() on procs whose killtime is reached
+  while (my ($pid, $jobstep) = each %proc)
   {
-    return;
+    if (exists $jobstep->{killtime}
+        && $jobstep->{killtime} <= time
+        && $jobstep->{stderr_at} < $last_squeue_check)
+    {
+      Log($jobstep->{jobstep}, "killing orphaned srun process $pid (task disappeared from slurm queue)");
+      killem ($pid);
+    }
   }
-  $squeue_checked = time;
 
   if (!$have_slurm)
   {
@@ -1285,13 +1296,13 @@ sub check_squeue
   }
 
   # get a list of steps still running
-  my @squeue = `squeue -s -h -o '%i %j' && echo ok`;
-  chop @squeue;
-  if ($squeue[-1] ne "ok")
+  my @squeue = `squeue --jobs=\Q$ENV{SLURM_JOBID}\E --steps --format='%i %j' --noheader`;
+  if ($? != 0)
   {
+    Log(undef, "warning: squeue exit status $? ($!)");
     return;
   }
-  pop @squeue;
+  chop @squeue;
 
   # which of my jobsteps are running, according to squeue?
   my %ok;
@@ -1306,15 +1317,23 @@ sub check_squeue
     }
   }
 
-  # which of my active child procs (>60s old) were not mentioned by squeue?
-  foreach (keys %proc)
+  # Check for child procs >60s old and not mentioned by squeue.
+  while (my ($pid, $jobstep) = each %proc)
   {
-    if ($proc{$_}->{time} < time - 60
-	&& !exists $ok{$proc{$_}->{jobstepname}}
-	&& !exists $proc{$_}->{killtime})
+    if ($jobstep->{time} < time - 60
+        && $jobstep->{jobstepname}
+        && !exists $ok{$jobstep->{jobstepname}}
+        && !exists $jobstep->{killtime})
     {
-      # kill this proc if it hasn't exited in 30 seconds
-      $proc{$_}->{killtime} = time + 30;
+      # According to slurm, this task has ended (successfully or not)
+      # -- but our srun child hasn't exited. First we must wait (30
+      # seconds) in case this is just a race between communication
+      # channels. Then, if our srun child process still hasn't
+      # terminated, we'll conclude some slurm communication
+      # error/delay has caused the task to die without notifying srun,
+      # and we'll kill srun ourselves.
+      $jobstep->{killtime} = time + 30;
+      Log($jobstep->{jobstep}, "notice: task is not in slurm queue but srun process $pid has not exited");
     }
   }
 }
@@ -1339,6 +1358,7 @@ sub readfrompipes
     while (0 < sysread ($reader{$job}, $buf, 8192))
     {
       print STDERR $buf if $ENV{CRUNCH_DEBUG};
+      $jobstep[$job]->{stderr_at} = time;
       $jobstep[$job]->{stderr} .= $buf;
       preprocess_stderr ($job);
       if (length ($jobstep[$job]->{stderr}) > 16384)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list