[ARVADOS] created: 1.1.4-669-gfebdebbb5

Git user git at public.curoverse.com
Fri Jul 20 16:46:38 EDT 2018


        at  febdebbb58592be73dcf7d4bd4b2c7ff96657741 (commit)


commit febdebbb58592be73dcf7d4bd4b2c7ff96657741
Author: Peter Amstutz <pamstutz at veritasgenetics.com>
Date:   Fri Jul 20 16:42:56 2018 -0400

    13546: crunch-job has timeout on srun_sync
    
    * Add global timeout to srun_sync, default 15 minutes, terminates
      job if a call to srun_sync exceeds the timeout.
    
    * Default can be adjusted by setting CRUNCH_SRUN_SYNC_TIMEOUT in the
      environment of crunch_dispatch.rb (value is in seconds)
    
    Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz at veritasgenetics.com>

diff --git a/sdk/cli/bin/crunch-job b/sdk/cli/bin/crunch-job
index 9343fcfbf..b8afe638a 100755
--- a/sdk/cli/bin/crunch-job
+++ b/sdk/cli/bin/crunch-job
@@ -132,6 +132,7 @@ my $resume_stash;
 my $cgroup_root = "/sys/fs/cgroup";
 my $docker_bin = "docker.io";
 my $docker_run_args = "";
+my $srun_sync_timeout = 15*60;
 GetOptions('force-unlock' => \$force_unlock,
            'git-dir=s' => \$git_dir,
            'job=s' => \$jobspec,
@@ -141,6 +142,7 @@ GetOptions('force-unlock' => \$force_unlock,
            'cgroup-root=s' => \$cgroup_root,
            'docker-bin=s' => \$docker_bin,
            'docker-run-args=s' => \$docker_run_args,
+           'srun-sync-timeout=i' => \$srun_sync_timeout,
     );
 
 if (defined $job_api_token) {
@@ -2007,6 +2009,8 @@ sub srun_sync
   my ($stdout_r, $stdout_w);
   pipe $stdout_r, $stdout_w or croak("pipe() failed: $!");
 
+  my $started_srun = scalar time;
+
   my $srunpid = fork();
   if ($srunpid == 0)
   {
@@ -2050,6 +2054,12 @@ sub srun_sync
     if (!$busy) {
       select(undef, undef, undef, 0.1);
     }
+    if (($started_srun + $srun_sync_timeout) < scalar time) {
+      # Exceeded general timeout for "srun_sync" operations, likely
+      # means something got stuck on the remote node.
+      Log(undef, "srun_sync exceeded timeout, will fail.");
+      $main::please_freeze = 1;
+    }
     killem(keys %proc) if $main::please_freeze;
   }
   my $exited = $?;
diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb
index 73ad7606c..449d7d516 100644
--- a/services/api/lib/crunch_dispatch.rb
+++ b/services/api/lib/crunch_dispatch.rb
@@ -29,6 +29,7 @@ class CrunchDispatch
     @docker_bin = ENV['CRUNCH_JOB_DOCKER_BIN']
     @docker_run_args = ENV['CRUNCH_JOB_DOCKER_RUN_ARGS']
     @cgroup_root = ENV['CRUNCH_CGROUP_ROOT']
+    @srun_sync_timeout = ENV['CRUNCH_SRUN_SYNC_TIMEOUT']
 
     @arvados_internal = Rails.configuration.git_internal_dir
     if not File.exist? @arvados_internal
@@ -419,6 +420,10 @@ class CrunchDispatch
         cmd_args += ['--docker-run-args', @docker_run_args]
       end
 
+      if @srun_sync_timeout
+        cmd_args += ['--srun-sync-timeout', @srun_sync_timeout]
+      end
+
       if have_job_lock?(job)
         cmd_args << "--force-unlock"
       end

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list