[ARVADOS] created: 620e0a87fb5c9e798148b58c045e5bba7224cd03

git at public.curoverse.com git at public.curoverse.com
Sat Jan 23 01:55:03 EST 2016


        at  620e0a87fb5c9e798148b58c045e5bba7224cd03 (commit)


commit 620e0a87fb5c9e798148b58c045e5bba7224cd03
Author: Tom Clegg <tom at curoverse.com>
Date:   Sat Jan 23 01:47:48 2016 -0500

    7263: Fix confusion between %proc and %jobstep.
    
    $proc{$pid}->{jobstep} is an index into @jobstep
    $proc{$pid}->{jobstepname} is the name we told srun to use
    $proc{$pid}->{killtime} is a deadline when we should kill the process
    $jobstep[$jobstepid]->{stderr_at} is the time of last stderr received
    
    We were mistakenly using $proc{$pid}->{stderr_at}, which was always
    undef (0 in a numeric comparison) and therefore always less than
    $last_squeue_check. This resulted in jobs being killed as "slurm
    orphans" when the real reason they hadn't been returned by waitpid()
    was that we hadn't finished consuming their stderr yet.

diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index 70d05f0..7c50c28 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -1340,8 +1340,9 @@ sub check_squeue
   # squeue check interval (15s) this should make the squeue check an
   # infrequent event.
   my $silent_procs = 0;
-  for my $jobstep (values %proc)
+  for my $procinfo (values %proc)
   {
+    my $jobstep = $jobstep[$procinfo->{jobstep}];
     if ($jobstep->{stderr_at} < $last_squeue_check)
     {
       $silent_procs++;
@@ -1350,17 +1351,18 @@ sub check_squeue
   return if $silent_procs == 0;
 
   # use killem() on procs whose killtime is reached
-  while (my ($pid, $jobstep) = each %proc)
+  while (my ($pid, $procinfo) = each %proc)
   {
-    if (exists $jobstep->{killtime}
-        && $jobstep->{killtime} <= time
+    my $jobstep = $jobstep[$procinfo->{jobstep}];
+    if (exists $procinfo->{killtime}
+        && $procinfo->{killtime} <= time
         && $jobstep->{stderr_at} < $last_squeue_check)
     {
       my $sincewhen = "";
       if ($jobstep->{stderr_at}) {
         $sincewhen = " in last " . (time - $jobstep->{stderr_at}) . "s";
       }
-      Log($jobstep->{jobstep}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)");
+      Log($procinfo->{jobstep}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)");
       killem ($pid);
     }
   }
@@ -1395,12 +1397,12 @@ sub check_squeue
   }
 
   # Check for child procs >60s old and not mentioned by squeue.
-  while (my ($pid, $jobstep) = each %proc)
+  while (my ($pid, $procinfo) = each %proc)
   {
-    if ($jobstep->{time} < time - 60
-        && $jobstep->{jobstepname}
-        && !exists $ok{$jobstep->{jobstepname}}
-        && !exists $jobstep->{killtime})
+    if ($procinfo->{time} < time - 60
+        && $procinfo->{jobstepname}
+        && !exists $ok{$procinfo->{jobstepname}}
+        && !exists $procinfo->{killtime})
     {
       # According to slurm, this task has ended (successfully or not)
       # -- but our srun child hasn't exited. First we must wait (30
@@ -1409,8 +1411,8 @@ sub check_squeue
       # terminated, we'll conclude some slurm communication
       # error/delay has caused the task to die without notifying srun,
       # and we'll kill srun ourselves.
-      $jobstep->{killtime} = time + 30;
-      Log($jobstep->{jobstep}, "notice: task is not in slurm queue but srun process $pid has not exited");
+      $procinfo->{killtime} = time + 30;
+      Log($procinfo->{jobstep}, "notice: task is not in slurm queue but srun process $pid has not exited");
     }
   }
 }

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list