[ARVADOS] updated: 3a8714e6fcf41c46d1fde0a6a3e4beb1367d181d
git at public.curoverse.com
git at public.curoverse.com
Sun Jan 24 19:48:23 EST 2016
Summary of changes:
discards 620e0a87fb5c9e798148b58c045e5bba7224cd03 (commit)
via 3a8714e6fcf41c46d1fde0a6a3e4beb1367d181d (commit)
This update added new revisions after undoing existing revisions. That is
to say, the old revision is not a strict subset of the new revision. This
situation occurs when you --force push a change and generate a repository
containing something like this:
* -- * -- B -- O -- O -- O (620e0a87fb5c9e798148b58c045e5bba7224cd03)
           \
            N -- N -- N (3a8714e6fcf41c46d1fde0a6a3e4beb1367d181d)
When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit 3a8714e6fcf41c46d1fde0a6a3e4beb1367d181d
Author: Tom Clegg <tom at curoverse.com>
Date: Sun Jan 24 19:48:06 2016 -0500
8284: Fix confusion between %proc and %jobstep.
$proc{$pid}->{jobstep} is an index into @jobstep
$proc{$pid}->{jobstepname} is the name we told srun to use
$proc{$pid}->{killtime} is a deadline when we should kill the process
$jobstep[$jobstepid]->{stderr_at} is the time of last stderr received
We were mistakenly using $proc->{$pid}->{stderr_at}, which was always
undef and therefore always less than $last_squeue_check. This resulted
in jobs being killed as "slurm orphans" when the real reason they
hadn't been returned by waitpid() was that we hadn't finished
consuming their stderr yet.
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index 70d05f0..7c50c28 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -1340,8 +1340,9 @@ sub check_squeue
# squeue check interval (15s) this should make the squeue check an
# infrequent event.
my $silent_procs = 0;
- for my $jobstep (values %proc)
+ for my $procinfo (values %proc)
{
+ my $jobstep = $jobstep[$procinfo->{jobstep}];
if ($jobstep->{stderr_at} < $last_squeue_check)
{
$silent_procs++;
@@ -1350,17 +1351,18 @@ sub check_squeue
return if $silent_procs == 0;
# use killem() on procs whose killtime is reached
- while (my ($pid, $jobstep) = each %proc)
+ while (my ($pid, $procinfo) = each %proc)
{
- if (exists $jobstep->{killtime}
- && $jobstep->{killtime} <= time
+ my $jobstep = $jobstep[$procinfo->{jobstep}];
+ if (exists $procinfo->{killtime}
+ && $procinfo->{killtime} <= time
&& $jobstep->{stderr_at} < $last_squeue_check)
{
my $sincewhen = "";
if ($jobstep->{stderr_at}) {
$sincewhen = " in last " . (time - $jobstep->{stderr_at}) . "s";
}
- Log($jobstep->{jobstep}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)");
+ Log($procinfo->{jobstep}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)");
killem ($pid);
}
}
@@ -1395,12 +1397,12 @@ sub check_squeue
}
# Check for child procs >60s old and not mentioned by squeue.
- while (my ($pid, $jobstep) = each %proc)
+ while (my ($pid, $procinfo) = each %proc)
{
- if ($jobstep->{time} < time - 60
- && $jobstep->{jobstepname}
- && !exists $ok{$jobstep->{jobstepname}}
- && !exists $jobstep->{killtime})
+ if ($procinfo->{time} < time - 60
+ && $procinfo->{jobstepname}
+ && !exists $ok{$procinfo->{jobstepname}}
+ && !exists $procinfo->{killtime})
{
# According to slurm, this task has ended (successfully or not)
# -- but our srun child hasn't exited. First we must wait (30
@@ -1409,8 +1411,8 @@ sub check_squeue
# terminated, we'll conclude some slurm communication
# error/delay has caused the task to die without notifying srun,
# and we'll kill srun ourselves.
- $jobstep->{killtime} = time + 30;
- Log($jobstep->{jobstep}, "notice: task is not in slurm queue but srun process $pid has not exited");
+ $procinfo->{killtime} = time + 30;
+ Log($procinfo->{jobstep}, "notice: task is not in slurm queue but srun process $pid has not exited");
}
}
}
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list