[ARVADOS] updated: 2b34839cdf95291a7356554e05e50b9ced177dd6

Git user git at public.curoverse.com
Tue Feb 21 13:30:39 EST 2017


Summary of changes:
 services/api/lib/crunch_dispatch.rb            | 55 +++++++++++++++++++-------
 services/api/test/unit/crunch_dispatch_test.rb | 19 +++++++++
 2 files changed, 60 insertions(+), 14 deletions(-)

       via  2b34839cdf95291a7356554e05e50b9ced177dd6 (commit)
      from  f29e958f50a914504f971d344c93ee7297d77fbb (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.


commit 2b34839cdf95291a7356554e05e50b9ced177dd6
Author: radhika <radhika at curoverse.com>
Date:   Tue Feb 21 13:29:58 2017 -0500

    10979: scancel orphaned job nodes in crunch1.

diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb
index 18a4250..34fed92 100644
--- a/services/api/lib/crunch_dispatch.rb
+++ b/services/api/lib/crunch_dispatch.rb
@@ -832,6 +832,9 @@ class CrunchDispatch
         unless (@todo_pipelines.empty? and @pipe_auth_tokens.empty?) or did_recently(:update_pipelines, 5.0)
           update_pipelines
         end
+        unless did_recently('check_orphaned_slurm_jobs', 60)
+          check_orphaned_slurm_jobs
+        end
       end
       reap_children
       select(@running.values.collect { |j| [j[:stdout], j[:stderr]] }.flatten,
@@ -892,6 +895,39 @@ class CrunchDispatch
     end
   end
 
+  def check_orphaned_slurm_jobs
+    act_as_system_user do
+      if Rails.configuration.crunch_job_wrapper == :slurm_immediate
+        squeue_uuids = File.popen(['squeue', '-a', '-h', '-o', '%j']).readlines.map do |line|
+          line.strip.split(' ', 1)
+        end.collect{|l| l[0]}.
+            select{|uuid| uuid.match(HasUuid::UUID_REGEX)}.
+            select{|uuid| !@running.has_key?(uuid)}
+
+        return if squeue_uuids.size == 0
+
+        scancel_uuids = squeue_uuids - Job.where('uuid in (?) and (state=? or modified_at>?)',
+                                                 squeue_uuids, 'Running', (Time.now - 60)).collect(&:uuid)
+        scancel_uuids.each do |uuid|
+          Rails.logger.info "orphaned job: scancel #{uuid}"
+          scancel uuid, true
+        end
+      end
+    end
+  end
+
+  def sudo_preface
+    return [] if not Server::Application.config.crunch_job_user
+    ["sudo", "-E", "-u",
+     Server::Application.config.crunch_job_user,
+     "LD_LIBRARY_PATH=#{ENV['LD_LIBRARY_PATH']}",
+     "PATH=#{ENV['PATH']}",
+     "PERLLIB=#{ENV['PERLLIB']}",
+     "PYTHONPATH=#{ENV['PYTHONPATH']}",
+     "RUBYLIB=#{ENV['RUBYLIB']}",
+     "GEM_PATH=#{ENV['GEM_PATH']}"]
+  end
+
   protected
 
   def have_job_lock?(job)
@@ -934,23 +970,14 @@ class CrunchDispatch
     end
   end
 
-  def scancel slurm_id
-    cmd = sudo_preface + ['scancel', slurm_id]
+  def scancel slurm_id, use_name=false
+    scancel_cmd = ['scancel']
+    scancel_cmd << '-n' if use_name
+    scancel_cmd << slurm_id
+    cmd = sudo_preface + scancel_cmd
     puts File.popen(cmd).read
     if not $?.success?
       Rails.logger.error "scancel #{slurm_id.shellescape}: $?"
     end
   end
-
-  def sudo_preface
-    return [] if not Server::Application.config.crunch_job_user
-    ["sudo", "-E", "-u",
-     Server::Application.config.crunch_job_user,
-     "LD_LIBRARY_PATH=#{ENV['LD_LIBRARY_PATH']}",
-     "PATH=#{ENV['PATH']}",
-     "PERLLIB=#{ENV['PERLLIB']}",
-     "PYTHONPATH=#{ENV['PYTHONPATH']}",
-     "RUBYLIB=#{ENV['RUBYLIB']}",
-     "GEM_PATH=#{ENV['GEM_PATH']}"]
-  end
 end
diff --git a/services/api/test/unit/crunch_dispatch_test.rb b/services/api/test/unit/crunch_dispatch_test.rb
index 6233fb4..8e604ee 100644
--- a/services/api/test/unit/crunch_dispatch_test.rb
+++ b/services/api/test/unit/crunch_dispatch_test.rb
@@ -202,4 +202,23 @@ class CrunchDispatchTest < ActiveSupport::TestCase
       assert_equal 5, job[:log_throttle_lines_so_far]
     end
   end
+
+  test 'scancel orphaned job nodes' do
+    Rails.configuration.crunch_job_wrapper = :slurm_immediate
+    act_as_system_user do
+      dispatch = CrunchDispatch.new
+
+      squeue_resp = File.popen("echo zzzzz-8i9sb-pshmckwoma9plh7\necho thisisnotvalidjobuuid\necho zzzzz-8i9sb-4cf0abc123e809j\n")
+
+      File.expects(:popen).
+        with(['squeue', '-a', '-h', '-o', '%j']).
+        returns(squeue_resp)
+
+      File.expects(:popen).
+        with(dispatch.sudo_preface + ['scancel', '-n', 'zzzzz-8i9sb-4cf0abc123e809j']).
+        returns(squeue_resp)
+
+      dispatch.check_orphaned_slurm_jobs
+    end
+  end
 end

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list