[ARVADOS] created: fab4aefa018fee992e2ac4acf2ccad02cbf292b7

git at public.curoverse.com git at public.curoverse.com
Thu Dec 17 18:07:13 EST 2015


        at  fab4aefa018fee992e2ac4acf2ccad02cbf292b7 (commit)


commit fab4aefa018fee992e2ac4acf2ccad02cbf292b7
Author: Tom Clegg <tom at curoverse.com>
Date:   Thu Dec 17 18:07:00 2015 -0500

    7965: Fail orphaned jobs still marked "running" after reboot.

diff --git a/services/api/lib/crunch_dispatch.rb b/services/api/lib/crunch_dispatch.rb
index 5d598d4..a24e8ac 100644
--- a/services/api/lib/crunch_dispatch.rb
+++ b/services/api/lib/crunch_dispatch.rb
@@ -2,6 +2,7 @@ require 'open3'
 require 'shellwords'
 
 class CrunchDispatch
+  extend DbCurrentTime
   include ApplicationHelper
   include Process
 
@@ -194,7 +195,7 @@ class CrunchDispatch
     nodelist
   end
 
-  def fail_job job, message
+  def fail_job job, message, skip_lock: false
     $stderr.puts "dispatch: #{job.uuid}: #{message}"
     begin
       Log.new(object_uuid: job.uuid,
@@ -206,7 +207,7 @@ class CrunchDispatch
       $stderr.puts "dispatch: log.create failed"
     end
 
-    if not have_job_lock?(job)
+    if not skip_lock and not have_job_lock?(job)
       begin
         job.lock @authorizations[job.uuid].user.uuid
       rescue ArvadosModel::AlreadyLockedError
@@ -339,16 +340,7 @@ class CrunchDispatch
         raise "Unknown crunch_job_wrapper: #{Server::Application.config.crunch_job_wrapper}"
       end
 
-      if Server::Application.config.crunch_job_user
-        cmd_args.unshift("sudo", "-E", "-u",
-                         Server::Application.config.crunch_job_user,
-                         "LD_LIBRARY_PATH=#{ENV['LD_LIBRARY_PATH']}",
-                         "PATH=#{ENV['PATH']}",
-                         "PERLLIB=#{ENV['PERLLIB']}",
-                         "PYTHONPATH=#{ENV['PYTHONPATH']}",
-                         "RUBYLIB=#{ENV['RUBYLIB']}",
-                         "GEM_PATH=#{ENV['GEM_PATH']}")
-      end
+      cmd_args = sudo_preface + cmd_args
 
       next unless get_authorization job
 
@@ -362,7 +354,7 @@ class CrunchDispatch
         # reasonable thing to do at this point.
         repo = Repository.where(name: job.repository).first
         if repo.nil? or repo.server_path.nil?
-          fail_job "Repository #{job.repository} not found under #{@repo_root}"
+          fail_job job, "Repository #{job.repository} not found under #{@repo_root}"
           next
         end
         ready &&= get_commit repo.server_path, job.script_version
@@ -810,6 +802,50 @@ class CrunchDispatch
     end
   end
 
+  # Fail running jobs that started before a threshold: boot time when
+  # before is 'reboot', Time.parse(before) when given, else db_current_time.
+  # Under :slurm_immediate, slurm jobs named by the job uuid are scancelled.
+  def fail_jobs before: nil
+    act_as_system_user do
+      if before == 'reboot'
+        stat_file = open('/proc/stat')
+        begin
+          boottime = stat_file.map(&:split).map { |k, v| v if k == 'btime' }.compact.last
+        ensure
+          stat_file.close # do not leak the descriptor
+        end
+        raise "Could not find btime in /proc/stat" if not boottime
+        threshold = Time.at(boottime.to_i)
+      elsif before
+        threshold = Time.parse(before, Time.now)
+      else
+        threshold = db_current_time # NOTE(review): class uses `extend DbCurrentTime`; confirm this resolves on an instance
+      end
+      Rails.logger.info "fail_jobs: threshold is #{threshold}"
+
+      squeue = []
+      if Rails.configuration.crunch_job_wrapper == :slurm_immediate
+        # [["slurm_job_id", "slurm_job_name"], ...]
+        squeue_io = File.popen(['squeue', '-h', '-o', '%i %j'])
+        begin
+          squeue = squeue_io.readlines.map { |line| line.strip.split(' ', 2) }
+        ensure
+          squeue_io.close # reap the squeue child instead of leaving it to GC
+        end
+      end
+
+      Job.running.where('started_at < ?', threshold).each do |job|
+        Rails.logger.debug "fail_jobs: #{job.uuid} started #{job.started_at}"
+        squeue.select { |_, name| name == job.uuid }.each do |slurm_id, _|
+          Rails.logger.info "fail_jobs: scancel #{slurm_id} for #{job.uuid}"
+          scancel slurm_id
+        end
+        fail_job(job, "cleaned up stale job: started before #{threshold}",
+                 skip_lock: true)
+      end
+    end
+  end
+
   protected
 
   def have_job_lock?(job)
@@ -851,4 +887,24 @@ class CrunchDispatch
       running_job[:stderr_flushed_at] = Time.now
     end
   end
+
+  # Run scancel (behind sudo_preface) for one slurm job id, echo its output, and log an error if it fails.
+  def scancel slurm_id
+    io = File.popen(sudo_preface + ['scancel', slurm_id])
+    puts io.read
+    io.close # closing a popen pipe waits for the child; that is what sets $?
+    Rails.logger.error "scancel #{slurm_id.shellescape}: #{$?}" unless $?.success?
+  end
+
+  # Build the argv prefix used to run a command as the configured
+  # crunch_job_user via sudo. Returns [] when no such user is
+  # configured; otherwise the current values of a fixed list of
+  # environment variables are passed along as NAME=value arguments.
+  def sudo_preface
+    job_user = Server::Application.config.crunch_job_user
+    return [] if not job_user
+    passthru = %w(LD_LIBRARY_PATH PATH PERLLIB PYTHONPATH RUBYLIB GEM_PATH)
+    env_args = passthru.map { |name| "#{name}=#{ENV[name]}" }
+    ["sudo", "-E", "-u", job_user] + env_args
+  end
 end
diff --git a/services/api/script/cancel_stale_jobs.rb b/services/api/script/cancel_stale_jobs.rb
deleted file mode 100755
index 4949ec0..0000000
--- a/services/api/script/cancel_stale_jobs.rb
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env ruby
-
-
-if ENV["CRUNCH_DISPATCH_LOCKFILE"]
-  lockfilename = ENV.delete "CRUNCH_DISPATCH_LOCKFILE"
-  lockfile = File.open(lockfilename, File::RDWR|File::CREAT, 0644)
-  unless lockfile.flock File::LOCK_EX|File::LOCK_NB
-    abort "Lock unavailable on #{lockfilename} - exit"
-  end
-end
-
-ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development"
-
-require File.dirname(__FILE__) + '/../config/boot'
-require File.dirname(__FILE__) + '/../config/environment'
-
-class CancelJobs
-  include ApplicationHelper
-
-  def cancel_stale_jobs
-    act_as_system_user do
-      Job.running.each do |jobrecord|
-        f = Log.where("object_uuid=?", jobrecord.uuid).limit(1).order("created_at desc").first
-        if f
-          age = (Time.now - f.created_at)
-          if age > 300
-            $stderr.puts "dispatch: failing orphan job #{jobrecord.uuid}, last log is #{age} seconds old"
-            # job is marked running, but not known to crunch-dispatcher, and
-            # hasn't produced any log entries for 5 minutes, so mark it as failed.
-            jobrecord.running = false
-            jobrecord.cancelled_at ||= Time.now
-            jobrecord.finished_at ||= Time.now
-            if jobrecord.success.nil?
-              jobrecord.success = false
-            end
-            jobrecord.save!
-          end
-        end
-      end
-    end
-  end
-end
-
-CancelJobs.new.cancel_stale_jobs
diff --git a/services/api/script/fail-jobs.rb b/services/api/script/fail-jobs.rb
new file mode 100755
index 0000000..fd9212c
--- /dev/null
+++ b/services/api/script/fail-jobs.rb
@@ -0,0 +1,18 @@
+#!/usr/bin/env ruby
+# Command-line wrapper: parses --before and calls CrunchDispatch#fail_jobs.
+require 'trollop'
+
+opts = Trollop::options do
+  banner 'Fail jobs that have state=="Running".'
+  banner 'Options:'
+  opt(:before,
+      'fail only jobs that started before the given time (or "reboot")',
+      type: :string)
+end
+# RAILS_ENV comes from ARGV[0] if present, else the environment, else "development".
+ENV["RAILS_ENV"] = ARGV[0] || ENV["RAILS_ENV"] || "development"
+require File.dirname(__FILE__) + '/../config/boot'
+require File.dirname(__FILE__) + '/../config/environment'
+require Rails.root.join('lib/crunch_dispatch.rb')
+# opts[:before] is passed through unchanged (nil when --before was not given).
+CrunchDispatch.new.fail_jobs before: opts[:before]
diff --git a/services/api/test/fixtures/files/proc_stat b/services/api/test/fixtures/files/proc_stat
new file mode 100644
index 0000000..eac6c47
--- /dev/null
+++ b/services/api/test/fixtures/files/proc_stat
@@ -0,0 +1,14 @@
+cpu  1632063 14136 880034 1195938459 1041039 63 21266 52811 0 0
+cpu0 291707 2191 123004 199461836 32816 58 4488 13329 0 0
+cpu1 279247 2288 168096 199443605 20358 0 3320 7776 0 0
+cpu2 243805 1099 145178 199516577 19542 0 2656 6975 0 0
+cpu3 225772 3025 145032 199534463 21217 0 2260 6578 0 0
+cpu4 280505 2581 151177 198587478 885147 2 4446 10116 0 0
+cpu5 311025 2950 147545 199394498 61957 2 4093 8035 0 0
+intr 165887918 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8993615 9108946 8200 0 50911 0 12573182 7875376 8631 0 44633 0 10027365 7325091 8544 0 59992 0 9835855 6999541 8145 0 65576 0 9789778 8583897 8184 0 55917 0 10003804 8546910 8448 0 53484 0 463 150 3174990 11523 3836341 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ctxt 255708943
+btime 1448378837
+processes 924315
+procs_running 1
+procs_blocked 0
+softirq 105120691 0 21194262 1261637 20292759 0 0 40708 13638302 27046 48665977
diff --git a/services/api/test/unit/fail_jobs_test.rb b/services/api/test/unit/fail_jobs_test.rb
new file mode 100644
index 0000000..d084c63
--- /dev/null
+++ b/services/api/test/unit/fail_jobs_test.rb
@@ -0,0 +1,57 @@
+require 'test_helper'
+require 'crunch_dispatch'
+
+class FailJobsTest < ActiveSupport::TestCase
+  include DbCurrentTime
+  # Exercises CrunchDispatch#fail_jobs and the script/fail-jobs.rb --help output.
+  setup do # creates two Running jobs whose started_at values straddle 1448378837
+    @job = {}
+    act_as_user users(:admin) do
+      @job[:before_reboot] = Job.create!(state: 'Running',
+                                         running: true,
+                                         started_at: Time.at(1448378000))
+      @job[:after_reboot] = Job.create!(state: 'Running',
+                                        running: true,
+                                        started_at: Time.at(1448379000))
+    end
+    @dispatch = CrunchDispatch.new
+  end
+  # squeue and scancel are both stubbed through File.popen; only the older job should fail.
+  test 'cancel slurm jobs' do
+    Rails.configuration.crunch_job_wrapper = :slurm_immediate
+    Rails.configuration.crunch_job_user = 'foobar'
+    fake_squeue = File.popen("echo 1234 #{@job[:before_reboot].uuid}")
+    fake_scancel = File.popen("true")
+    File.expects(:popen).
+      with(['squeue', '-h', '-o', '%i %j']).
+      returns(fake_squeue)
+    File.expects(:popen).
+      with(includes('sudo', '-u', 'foobar', 'scancel', '1234')).
+      returns(fake_scancel)
+    @dispatch.fail_jobs(before: Time.at(1448378837).to_s)
+    @job.values.map &:reload
+    assert_equal 'Failed', @job[:before_reboot].state
+    assert_equal 'Running', @job[:after_reboot].state
+  end
+  # open('/proc/stat') is stubbed to return the test/fixtures/files/proc_stat fixture.
+  test 'use reboot time' do
+    Rails.configuration.crunch_job_wrapper = nil
+    @dispatch.expects(:open).once.with('/proc/stat').
+      returns open(Rails.root.join('test/fixtures/files/proc_stat'))
+    t0 = db_current_time
+    @dispatch.fail_jobs(before: 'reboot')
+    t1 = db_current_time
+    @job.values.map &:reload
+    assert_equal 'Failed', @job[:before_reboot].state
+    assert_equal false, @job[:before_reboot].running
+    assert_equal false, @job[:before_reboot].success
+    assert_operator @job[:before_reboot].finished_at, :>=, t0
+    assert_operator @job[:before_reboot].finished_at, :<=, t1
+    assert_equal 'Running', @job[:after_reboot].state
+  end
+  # Runs the script with --help and matches its output against the expected option text.
+  test 'command line help' do
+    cmd = Rails.root.join('script/fail-jobs.rb').to_s
+    assert_match /Options:.*--before=/m, File.popen([cmd, '--help']).read
+  end
+end

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list