[ARVADOS] updated: a1b5da5e536e8bfc58187965d11312d1fe883972
git at public.curoverse.com
Mon Jun 1 09:54:53 EDT 2015
Summary of changes:
sdk/cli/bin/crunch-job | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
  discards  fc6473066977e983cddfc5455dde2d7816cc3d27 (commit)
       via  a1b5da5e536e8bfc58187965d11312d1fe883972 (commit)
This update added new revisions after undoing existing revisions. That is
to say, the old revision is not a strict subset of the new revision. This
situation occurs when you --force push a change and generate a repository
containing something like this:
 * -- * -- B -- O -- O -- O (fc6473066977e983cddfc5455dde2d7816cc3d27)
            \
             N -- N -- N (a1b5da5e536e8bfc58187965d11312d1fe883972)
When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
commit a1b5da5e536e8bfc58187965d11312d1fe883972
Author: Tom Clegg <tom at curoverse.com>
Date: Sun May 31 02:02:19 2015 -0400
6146: Improvements to "kill srun process if slurm task disappears" feature:
* Log when we notice a process is orphaned.
* Log when we decide to kill an orphaned process.
* Use `squeue --jobs $SLURM_JOBID` so slurm doesn't have to tell us
  about other jobs' tasks.
* Do not kill a process that is still reporting stderr.
* Do not check `squeue` at all if every process has reported stderr
  since the last squeue check. (In such cases, it seems safe to assume
  no children are hung/dead.)
* Use the same timer/interval (15 seconds) for both noticing and
  killing orphaned processes.
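Taken together, the new check works roughly as sketched below. This is a
simplified, self-contained Perl sketch rather than the crunch-job code:
orphan_check() and slurm_running_steps() are hypothetical names, the two
separate loops in the real code are folded into one, and the real code's
extra conditions (such as the 60-second minimum process age) are omitted.
The %proc layout and its stderr_at/killtime/jobstepname fields follow the
diff below.

#!/usr/bin/env perl
# Simplified sketch of the orphan-detection flow described above; not the
# actual crunch-job implementation. orphan_check() and slurm_running_steps()
# are hypothetical names used only in this sketch.
use strict;
use warnings;

my $squeue_checked = 0;   # epoch time of the most recent squeue check
my %proc;                 # pid => { stderr_at =>, killtime =>, jobstepname => }

sub orphan_check {
  my $last_squeue_check = $squeue_checked;

  # One 15-second timer covers both noticing and killing orphans.
  return if $last_squeue_check > time - 15;
  $squeue_checked = time;

  # If every child has written stderr since the last check, they are all
  # demonstrably alive and squeue does not need to be consulted at all.
  my @silent = grep { ($proc{$_}{stderr_at} || 0) < $last_squeue_check } keys %proc;
  return unless @silent;

  # Ask slurm only about this job's steps (the diff shows the real
  # squeue invocation; this helper is a stand-in).
  my %running = slurm_running_steps();

  for my $pid (@silent) {
    my $p = $proc{$pid};
    next if $running{ $p->{jobstepname} // '' };

    if (!exists $p->{killtime}) {
      # First sighting: log it and start a 30-second grace period.
      warn "notice: srun process $pid is not in the slurm queue\n";
      $p->{killtime} = time + 30;
    }
    elsif ($p->{killtime} <= time) {
      # Grace period expired and the child is still silent: kill it.
      warn "killing orphaned srun process $pid\n";
      kill 'TERM', $pid;
    }
  }
}

# Stand-in helper: step names slurm reports as running for this job.
sub slurm_running_steps {
  return () unless defined $ENV{SLURM_JOBID};
  my @names = `squeue --jobs=\Q$ENV{SLURM_JOBID}\E --steps --format='%j' --noheader`;
  return () if $? != 0;
  chomp @names;
  my %running;
  $running{$_} = 1 for @names;
  return %running;
}

In the actual change the "notice" and "kill" decisions live in two separate
loops over %proc, and the 30-second killtime is only set for processes that
are more than 60 seconds old; the diff below has the details.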
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index 76eb95d..c536da6 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -342,8 +342,7 @@ my @jobstep_todo = ();
 my @jobstep_done = ();
 my @jobstep_tomerge = ();
 my $jobstep_tomerge_level = 0;
-my $squeue_checked;
-my $squeue_kill_checked;
+my $squeue_checked = 0;
 my $latest_refresh = scalar time;
@@ -1254,29 +1253,41 @@ sub check_refresh_wanted
 sub check_squeue
 {
-  # return if the kill list was checked <4 seconds ago
-  if (defined $squeue_kill_checked && $squeue_kill_checked > time - 4)
-  {
-    return;
-  }
-  $squeue_kill_checked = time;
+  my $last_squeue_check = $squeue_checked;
-  # use killem() on procs whose killtime is reached
-  for (keys %proc)
+  # Do not call `squeue` or check the kill list more than once every
+  # 15 seconds.
+  return if $last_squeue_check > time - 15;
+  $squeue_checked = time;
+
+  # Look for children from which we haven't received stderr data since
+  # the last squeue check. If no such children exist, all procs are
+  # alive and there's no need to even look at squeue.
+  #
+  # As long as the crunchstat poll interval (10s) is shorter than the
+  # squeue check interval (15s) this should make the squeue check an
+  # infrequent event.
+  my $silent_procs = 0;
+  for my $jobstep (values %proc)
   {
-    if (exists $proc{$_}->{killtime}
-        && $proc{$_}->{killtime} <= time)
+    if ($jobstep->{stderr_at} < $last_squeue_check)
     {
-      killem ($_);
+      $silent_procs++;
     }
   }
+  return if $silent_procs == 0;
-  # return if the squeue was checked <60 seconds ago
-  if (defined $squeue_checked && $squeue_checked > time - 60)
+  # use killem() on procs whose killtime is reached
+  while (my ($pid, $jobstep) = each %proc)
   {
-    return;
+    if (exists $jobstep->{killtime}
+        && $jobstep->{killtime} <= time
+        && $jobstep->{stderr_at} < $last_squeue_check)
+    {
+      Log($jobstep->{jobstep}, "killing orphaned srun process $pid (task disappeared from slurm queue)");
+      killem ($pid);
+    }
   }
-  $squeue_checked = time;
   if (!$have_slurm)
   {
@@ -1285,13 +1296,13 @@ sub check_squeue
   }
   # get a list of steps still running
-  my @squeue = `squeue -s -h -o '%i %j' && echo ok`;
-  chop @squeue;
-  if ($squeue[-1] ne "ok")
+  my @squeue = `squeue --jobs=\Q$ENV{SLURM_JOBID}\E --steps --format='%i %j' --noheader`;
+  if ($? != 0)
   {
+    Log(undef, "warning: squeue exit status $? ($!)");
     return;
   }
-  pop @squeue;
+  chop @squeue;
   # which of my jobsteps are running, according to squeue?
   my %ok;
@@ -1306,15 +1317,23 @@ sub check_squeue
     }
   }
-  # which of my active child procs (>60s old) were not mentioned by squeue?
-  foreach (keys %proc)
+  # Check for child procs >60s old and not mentioned by squeue.
+  while (my ($pid, $jobstep) = each %proc)
   {
-    if ($proc{$_}->{time} < time - 60
-        && !exists $ok{$proc{$_}->{jobstepname}}
-        && !exists $proc{$_}->{killtime})
+    if ($jobstep->{time} < time - 60
+        && $jobstep->{jobstepname}
+        && !exists $ok{$jobstep->{jobstepname}}
+        && !exists $jobstep->{killtime})
     {
-      # kill this proc if it hasn't exited in 30 seconds
-      $proc{$_}->{killtime} = time + 30;
+      # According to slurm, this task has ended (successfully or not)
+      # -- but our srun child hasn't exited. First we must wait (30
+      # seconds) in case this is just a race between communication
+      # channels. Then, if our srun child process still hasn't
+      # terminated, we'll conclude some slurm communication
+      # error/delay has caused the task to die without notifying srun,
+      # and we'll kill srun ourselves.
+      $jobstep->{killtime} = time + 30;
+      Log($jobstep->{jobstep}, "notice: task is not in slurm queue but srun process $pid has not exited");
     }
   }
 }
@@ -1339,6 +1358,7 @@ sub readfrompipes
     while (0 < sysread ($reader{$job}, $buf, 8192))
     {
       print STDERR $buf if $ENV{CRUNCH_DEBUG};
+      $jobstep[$job]->{stderr_at} = time;
       $jobstep[$job]->{stderr} .= $buf;
       preprocess_stderr ($job);
       if (length ($jobstep[$job]->{stderr}) > 16384)
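The stderr_at timestamp added in readfrompipes is what makes the "skip
squeue when every child is still talking" shortcut possible. Below is a
minimal sketch of that idea, with read_child_stderr(), $fh and $proc as
hypothetical stand-ins for crunch-job's %reader/%proc plumbing, and
assuming the filehandle is non-blocking as crunch-job's reader pipes are:

use strict;
use warnings;

# Sketch only: record a per-child "last heard from" timestamp whenever
# stderr arrives, so a later check can tell silent children from active
# ones by comparing stderr_at with the time of the previous squeue check.
sub read_child_stderr {
  my ($pid, $fh, $proc) = @_;
  my $gotsome = 0;
  my $buf;
  while (0 < sysread($fh, $buf, 8192)) {
    $proc->{$pid}{stderr_at} = time;   # child is demonstrably alive
    $proc->{$pid}{stderr} .= $buf;
    $gotsome = 1;
  }
  return $gotsome;
}

Because the crunchstat poll interval (10 seconds) is shorter than the
squeue check interval (15 seconds), a healthy child will normally have a
stderr_at newer than the previous check, and the squeue call is skipped
entirely.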
-----------------------------------------------------------------------
hooks/post-receive
--