[ARVADOS] created: 1.1.4-669-gfebdebbb5
Git user
git at public.curoverse.com
Fri Jul 20 16:46:38 EDT 2018
at febdebbb58592be73dcf7d4bd4b2c7ff96657741 (commit)
commit febdebbb58592be73dcf7d4bd4b2c7ff96657741
Author: Peter Amstutz <pamstutz at veritasgenetics.com>
Date: Fri Jul 20 16:42:56 2018 -0400
13546: crunch-job has timeout on srun_sync
* Add a global timeout to srun_sync (default 15 minutes); the job is
terminated if a call to srun_sync exceeds the timeout.
* The default can be overridden by setting CRUNCH_SRUN_SYNC_TIMEOUT in the
environment of crunch_dispatch.rb (the value is in seconds).
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz at veritasgenetics.com>
diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index 9343fcfbf..b8afe638a 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -132,6 +132,7 @@ my $resume_stash;
my $cgroup_root = "/sys/fs/cgroup";
my $docker_bin = "docker.io";
my $docker_run_args = "";
+my $srun_sync_timeout = 15*60;
GetOptions('force-unlock' => \$force_unlock,
'git-dir=s' => \$git_dir,
'job=s' => \$jobspec,
@@ -141,6 +142,7 @@ GetOptions('force-unlock' => \$force_unlock,
'cgroup-root=s' => \$cgroup_root,
'docker-bin=s' => \$docker_bin,
'docker-run-args=s' => \$docker_run_args,
+ 'srun-sync-timeout=i' => \$srun_sync_timeout,
);
if (defined $job_api_token) {
@@ -2007,6 +2009,8 @@ sub srun_sync
my ($stdout_r, $stdout_w);
pipe $stdout_r, $stdout_w or croak("pipe() failed: $!");
+ my $started_srun = scalar time;
+
my $srunpid = fork();
if ($srunpid == 0)
{
@@ -2050,6 +2054,12 @@ sub srun_sync
if (!$busy) {
select(undef, undef, undef, 0.1);
}
+ if (($started_srun + $srun_sync_timeout) < scalar time) {
+ # Exceeded general timeout for "srun_sync" operations, likely
+ # means something got stuck on the remote node.
+ Log(undef, "srun_sync exceeded timeout, will fail.");
+ $main::please_freeze = 1;
+ }
killem(keys %proc) if $main::please_freeze;
}
my $exited = $?;
diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb
index 73ad7606c..449d7d516 100644
--- a/services/api/lib/crunch_dispatch.rb
+++ b/services/api/lib/crunch_dispatch.rb
@@ -29,6 +29,7 @@ class CrunchDispatch
@docker_bin = ENV['CRUNCH_JOB_DOCKER_BIN']
@docker_run_args = ENV['CRUNCH_JOB_DOCKER_RUN_ARGS']
@cgroup_root = ENV['CRUNCH_CGROUP_ROOT']
+ @srun_sync_timeout = ENV['CRUNCH_SRUN_SYNC_TIMEOUT']
@arvados_internal = Rails.configuration.git_internal_dir
if not File.exist? @arvados_internal
@@ -419,6 +420,10 @@ class CrunchDispatch
cmd_args += ['--docker-run-args', @docker_run_args]
end
+ if @srun_sync_timeout
+ cmd_args += ['--srun-sync-timeout', @srun_sync_timeout]
+ end
+
if have_job_lock?(job)
cmd_args << "--force-unlock"
end
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list